info_healthy <- read_xlsx ("data/raw/final_health_statistic.xlsx") #metadata of healthy subjects
info_ibs <- read_xlsx ("data/raw/final_ibs_141_statistic.xlsx") ##metadata of IBS subjects
#Combining `info_healthy` and `info_ibs` into a single dataset `combined_info` with subsequent content editing
combined_info <- info_healthy %>%
bind_rows(info_ibs) %>%
mutate (
BMI_min = ifelse (is.na (BMI_min), round (Weight_min /(Height_max/100 * Height_max/100), 2), BMI_min),
BMI_max = ifelse (is.na (BMI_max), round (Weight_max /(Height_min/100 * Height_min/100), 2), BMI_max)
) %>%
unite("BMI_range", BMI_min, BMI_max, sep = "-", na.rm = TRUE) %>%
unite ("Age_range", Age_min, Age_max, sep = "-", na.rm = TRUE) %>%
mutate(
Age_range = case_when(
Age_range == "18-40" | Age_range == "23-28" | Age_range == "16-42" | Age_range == "21-43" ~ "16-43",
Age <= 43 ~ "16-43",
Age > 43 ~ "> 43",
TRUE ~ NA_character_), #group 28-54 has been deleted
research_ID = sub ("research_", "", research_ID),
research_ID = case_when(
research_ID == 0 ~ 1,
research_ID == 1 ~ 2,
research_ID == 2 ~ 3,
research_ID == 3 ~ 4,
research_ID == 4 ~ 5,
research_ID == 6 ~ 6,
research_ID == 7 ~ 7),
patient_ID = row_number(),
Sex = ifelse (Sex == "mixed", NA, Sex),
Smoking = sub ("never", "Never", Smoking),
Smoking = case_when(
Smoking == "No" ~ "No",
Smoking == "Never" ~ "No",
Smoking == "Rarely (a few times/month)" ~ "Yes", #5 people
Smoking == "Occasionally (1-2 times/week)" ~ "Yes", #3 people
Smoking == "Regularly (3-5 times/week)" ~ "Yes", #1 people
Smoking == "Daily" ~ "Yes"), #7 чел.
Alcohol = sub ("rarely", "Rarely", Alcohol),
Alcohol = ifelse(Alcohol == "Regularly (3-5 times/week)"|
Alcohol == "Daily", #3 чел.
"Regularly (3-7 times/week)", Alcohol),
Antibiotics_usage = case_when(
Antibiotics_usage == "Month" | Antibiotics_usage == ~ "3 months" |
Antibiotics_usage == "6 months" ~ "1-6 months",
#2 IBS people - Month, 0 healthy people - 3 months, 0 healthy people - 6 months
Antibiotics_usage == "Year" | Antibiotics_usage == "Not use" ~
"12 months/Not use"), # 0 healthy people - 6-12 months, zero IBS people - Not use
Hygiene = case_when(
Hygiene == "Occasionally (1-2 times/week) cosmetics" ~ "Occasionally cosmetics (1-2 times/week)",
Hygiene == "Rarely (a few times/month) cosmetics" ~ "Rarely cosmetics (a few times/month)",
TRUE ~ Hygiene),
Hygiene = case_when(
Hygiene == "Daily cosmetics"|Hygiene == "Regularly cosmetics (3-5 times/week)" ~ "Regularly (3-7 times/week)", #0 чел. больных для 3-5 times/week
Hygiene == "Occasionally cosmetics (1-2 times/week)" | Hygiene == "Rarely cosmetics (a few times/month)" ~ "Occasionally (a few-8 times/month)",
#только 2 чел. больных для 1-2 times/week
Hygiene == "Never cosmetics" ~ "Never"),
Physical_activity = sub ("regularly", "Regularly", Smoking),
BMI = ifelse (is.na(Weight_kg), BMI, Weight_kg/ (Height_cm/100 * Height_cm/100)),
BMI_range = ifelse(BMI_range == "", NA, BMI_range),
BMI_category = case_when(
BMI_range == "18-25" ~ "normal/overweight",
BMI_range == "19.21-29.29" ~ "normal/overweight",
BMI_range == "20.6-29.6" ~ "normal/overweight",
BMI_range == "21.74-28.38" ~ "normal/overweight",
BMI_range == "18.5-30.8" ~ "normal/overweight", #a little over 30
BMI < 18.5 ~ "underweight",
BMI >= 18.5 & BMI < 30 ~ "normal/overweight",
BMI >= 30 ~ "obese")
) %>%
rename ("Cosmetics" = Hygiene ) %>%
mutate_if (is.character, as.factor) %>%
select(-c(
Instrument, # unique (combined_info$Instrument) = "Illumina MiSeq"
Isolation_source, # unique (combined_info$Isolation_source) = "faeces"
Assay_type, # unique (combined_info$Assay_type) = "AMPLICON"
Target_gene, # unique (combined_info$Target_gene) = "16S"
Main_Disease, # unique (combined_info$Main_Disease) = NA (for healthy), 141 (for IBS)
Drugs, # unique (combined_info$Drugs) = NA
Social_status, # unique (combined_info$Social_status) = NA, urban
Weight_kg, Height_cm, Weight_min, Weight_max, Height_min, Height_max, BMI_range, #have been used to create the variable "BMI_category" and to reduce the amount of NA in "BMI"
BMI, # NA for all healthy people
Birth_Year, # no additional information
Pets_type # unique (combined_info$Pets_type = cat, NA
))
rm (info_healthy, info_ibs)
#install.packages("summarytools")
library(summarytools)
saved_x11_option <- st_options("use.x11")
st_options(use.x11 = FALSE)
combined_info %>%
select(- patient_ID) %>%
dfSummary() %>%
print (method = "render")
| No | Variable | Stats / Values | Freqs (% of Valid) | Valid | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | research_ID [numeric] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2 | Seq_region [factor] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 3 | Seq_date [numeric] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 4 | Health_state [factor] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 5 | Age [numeric] |
|
45 distinct values | 134 (35.2%) | 247 (64.8%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 6 | Age_range [factor] |
|
|
286 (75.1%) | 95 (24.9%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 7 | Sex [factor] |
|
|
191 (50.1%) | 190 (49.9%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 8 | Country [factor] |
|
|
381 (100.0%) | 0 (0.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 9 | Race [factor] |
|
|
130 (34.1%) | 251 (65.9%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 10 | Smoking [factor] |
|
|
203 (53.3%) | 178 (46.7%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 11 | Alcohol [factor] |
|
|
87 (22.8%) | 294 (77.2%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 12 | Antibiotics_usage [factor] |
|
|
177 (46.5%) | 204 (53.5%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 13 | Physical_activity [factor] |
|
|
203 (53.3%) | 178 (46.7%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 14 | Travel_period [factor] |
|
|
86 (22.6%) | 295 (77.4%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 15 | Education_level [factor] |
|
|
84 (22.0%) | 297 (78.0%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 16 | Cosmetics [factor] |
|
|
86 (22.6%) | 295 (77.4%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 17 | Sleep_duration [factor] |
|
|
87 (22.8%) | 294 (77.2%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 18 | Diet_type [factor] |
|
|
18 (4.7%) | 363 (95.3%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 19 | Diet_duration [factor] |
|
|
13 (3.4%) | 368 (96.6%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 20 | Additive_usage [factor] |
|
|
11 (2.9%) | 370 (97.1%) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 21 | BMI_category [factor] |
|
|
293 (76.9%) | 88 (23.1%) |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-02-09
combined_info %>%
select (research_ID, Country, Seq_date, Seq_region, Health_state) %>%
mutate(Country = if_else (research_ID == 4, "Australia, Austria, Germany, Greece, Hungary, Ireland, Israel, Italy, Norway, UK, USA", Country),
Seq_date = ifelse (research_ID == 4, "2018-2023", Seq_date),
Health_state = if_else (research_ID == 4, "Disease/Health", Health_state)) %>%
group_by(research_ID, Country, Seq_date, Seq_region, Health_state) %>%
summarise(n = n()) %>%
flextable() %>% theme_box() %>%
merge_v("Health_state") %>%
align(align = "center", part = "all") %>%
set_caption("General characteristics of the included studies")
research_ID | Country | Seq_date | Seq_region | Health_state | n |
|---|---|---|---|---|---|
1 | Belgium | 2018 | V5-V6 | Health | 46 |
2 | Italy | 2018 | V3-V4 | 36 | |
3 | Poland | 2023 | V3-V4 | 70 | |
4 | Australia, Austria, Germany, Greece, Hungary, Ireland, Israel, Italy, Norway, UK, USA | 2018-2023 | V4 | Disease/Health | 87 |
5 | Austria | 2017 | V4 | Disease | 25 |
6 | Israel | 2022 | V4 | 22 | |
7 | Spain | 2015 | V4 | 95 |
library(gtsummary)
combined_info %>%
select(! c(patient_ID,
Diet_duration, Additive_usage, Diet_type #for all healthy subjects = NA
)) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Health_state) %>%
add_p()
| Characteristic | Disease, N = 1701 | Health, N = 2111 | p-value2 |
|---|---|---|---|
| research_ID | <0.001 | ||
| 1 | 0 (0%) | 46 (22%) | |
| 2 | 0 (0%) | 36 (17%) | |
| 3 | 0 (0%) | 70 (33%) | |
| 4 | 28 (16%) | 59 (28%) | |
| 5 | 25 (15%) | 0 (0%) | |
| 6 | 22 (13%) | 0 (0%) | |
| 7 | 95 (56%) | 0 (0%) | |
| Seq_region | <0.001 | ||
| V3-V4 | 0 (0%) | 106 (50%) | |
| V4 | 170 (100%) | 59 (28%) | |
| V5-V6 | 0 (0%) | 46 (22%) | |
| Seq_date | <0.001 | ||
| 2015 | 95 (56%) | 0 (0%) | |
| 2017 | 25 (15%) | 0 (0%) | |
| 2018 | 14 (8%) | 82 (39%) | |
| 2020 | 3 (2%) | 36 (17%) | |
| 2021 | 9 (5%) | 23 (11%) | |
| 2022 | 23 (14%) | 0 (0%) | |
| 2023 | 1 (1%) | 70 (33%) | |
| Age | 42 (32, 50) | 28 (26, 37) | <0.001 |
| Unknown | 95 | 152 | |
| Age_range | <0.001 | ||
| > 43 | 34 (45%) | 10 (5%) | |
| 16-43 | 41 (55%) | 201 (95%) | |
| Unknown | 95 | 0 | |
| Sex | 0.15 | ||
| female | 35 (48%) | 44 (37%) | |
| male | 38 (52%) | 74 (63%) | |
| Unknown | 97 | 93 | |
| Country | |||
| Australia | 0 (0%) | 6 (3%) | |
| Austria | 25 (15%) | 3 (1%) | |
| Belgium | 0 (0%) | 46 (22%) | |
| Germany | 0 (0%) | 45 (21%) | |
| Greece | 0 (0%) | 1 (0%) | |
| Hungary | 0 (0%) | 2 (1%) | |
| Ireland | 1 (1%) | 0 (0%) | |
| Israel | 22 (13%) | 1 (0%) | |
| Italy | 0 (0%) | 37 (18%) | |
| Norway | 1 (1%) | 0 (0%) | |
| Poland | 0 (0%) | 70 (33%) | |
| Spain | 95 (56%) | 0 (0%) | |
| UK | 12 (7%) | 0 (0%) | |
| USA | 14 (8%) | 0 (0%) | |
| Race | 0.4 | ||
| African American | 0 (0%) | 1 (1%) | |
| Asian or Pacific Islander | 0 (0%) | 1 (1%) | |
| Caucasian | 27 (100%) | 89 (86%) | |
| Hispanic | 0 (0%) | 1 (1%) | |
| Other | 0 (0%) | 11 (11%) | |
| Unknown | 143 | 108 | |
| Smoking | 4 (14%) | 12 (7%) | 0.2 |
| Unknown | 142 | 36 | |
| Alcohol | 0.002 | ||
| Never | 4 (14%) | 8 (14%) | |
| Occasionally (1-2 times/week) | 4 (14%) | 20 (34%) | |
| Rarely (a few times/month) | 9 (32%) | 27 (46%) | |
| Regularly (3-7 times/week) | 11 (39%) | 4 (7%) | |
| Unknown | 142 | 152 | |
| Antibiotics_usage | 0.020 | ||
| 1-6 months | 2 (8%) | 46 (30%) | |
| 12 months/Not use | 23 (92%) | 106 (70%) | |
| Unknown | 145 | 59 | |
| Physical_activity | 4 (14%) | 12 (7%) | 0.2 |
| Unknown | 142 | 36 | |
| Travel_period | 0.066 | ||
| 3 months | 3 (11%) | 15 (26%) | |
| 6 months | 1 (4%) | 9 (16%) | |
| Month | 4 (14%) | 9 (16%) | |
| Year | 20 (71%) | 25 (43%) | |
| Unknown | 142 | 153 | |
| Education_level | <0.001 | ||
| Bachelor's level | 5 (18%) | 26 (46%) | |
| Master's level | 18 (64%) | 10 (18%) | |
| School Level | 5 (18%) | 20 (36%) | |
| Unknown | 142 | 155 | |
| Cosmetics | 0.6 | ||
| Never | 15 (56%) | 29 (49%) | |
| Occasionally (a few-8 times/month) | 6 (22%) | 11 (19%) | |
| Regularly (3-7 times/week) | 6 (22%) | 19 (32%) | |
| Unknown | 143 | 152 | |
| Sleep_duration | 0.2 | ||
| > 8 hours | 4 (14%) | 5 (8%) | |
| 4-6 hours | 4 (14%) | 5 (8%) | |
| 6-7 hours | 12 (43%) | 19 (32%) | |
| 7-8 hours | 8 (29%) | 30 (51%) | |
| Unknown | 142 | 152 | |
| BMI_category | 0.012 | ||
| normal/overweight | 135 (96%) | 152 (100%) | |
| obese | 5 (4%) | 0 (0%) | |
| underweight | 1 (1%) | 0 (0%) | |
| Unknown | 29 | 59 | |
| 1 n (%); Median (IQR) | |||
| 2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test | |||
###combined_bacteria and data_wide (not batched) datasets
bacteria_healthy <- read_csv("data/raw/final_bacteria_health.csv") #abundance table for healthy
bacteria_ibs <- read_csv("data/raw/final_bacteria_ibs_141.csv") #abundance table for healthy
combined_bacteria <- bacteria_healthy %>%
bind_rows (bacteria_ibs) %>%
mutate(patient_ID = row_number())
rm (bacteria_healthy, bacteria_ibs)
# Combining `combined_info` and `combined_bacteria` into `data_wide`
data_wide <- combined_info %>% left_join (combined_bacteria)
data_wide %>%
select (where (function(x) sum (is.na(x))/ nrow(data_wide) * 100 > 0)) %>%
sapply (function(x) sum (is.na(x))/ nrow(data_wide) * 100) %>% round(1) %>%
as.data.frame() %>%
rename(NA_percentage = ".") %>%
mutate (
"Number of people with known data" = round (nrow(data_wide) - NA_percentage/100 * nrow(data_wide)),
NA_percentage = paste (NA_percentage, "%", sep = " ")
) %>%
arrange(desc (NA_percentage)) %>%
rownames_to_column() %>%
as_tibble() %>% flextable()
rowname | NA_percentage | Number of people with known data |
|---|---|---|
Additive_usage | 97.1 % | 11 |
Diet_duration | 96.6 % | 13 |
Diet_type | 95.3 % | 18 |
Education_level | 78 % | 84 |
Travel_period | 77.4 % | 86 |
Cosmetics | 77.4 % | 86 |
Alcohol | 77.2 % | 87 |
Sleep_duration | 77.2 % | 87 |
Race | 65.9 % | 130 |
Age | 64.8 % | 134 |
Antibiotics_usage | 53.5 % | 177 |
Sex | 49.9 % | 191 |
Smoking | 46.7 % | 203 |
Physical_activity | 46.7 % | 203 |
Age_range | 24.9 % | 286 |
BMI_category | 23.1 % | 293 |
data_wide# Устанавливаем порог процента
threshold_percent <- 95
# Функция для вычисления процента записей, не равных NA и не равных 0, для каждой колонки
calculate_percentage <- function(col) {
sum(!is.na(col) & col != 0) / length(col) * 100
}
# Применяем функцию к каждой колонке в датасете
percentage_non_zero_non_na <- sapply(data_wide[, -1], calculate_percentage)
# Создаем датафрейм с результатами
result_df_sort <- data.frame(
column = names(percentage_non_zero_non_na),
percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>%
arrange(desc(percentage))
# Отфильтровываем колонки, у которых процент записей менее threshold_percent%
filtered_columns <- result_df_sort[result_df_sort$percentage < threshold_percent, ]
# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_columns,
file = "data/originals/percentage_by_vars.xlsx")
# Перезапись data_wide с выбором колонок с процентом NA/0 менее threshold_percent
data_wide <- data_wide %>%
select (row.names(filtered_columns), research_ID)
rm (calculate_percentage, result_df_sort)
data_wide# Устанавливаем порог процента
threshold_percent <- 95
# Рассчитываем процент значений, не являющихся NA и не равных 0, для каждого пациента
percentage_non_zero_non_na <- rowMeans(!is.na(data_wide) & data_wide != 0, na.rm = TRUE) * 100
# Создаем датафрейм с результатами
result_df_sort <- data.frame(
patient_id = data_wide$patient_ID,
percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% arrange(desc(percentage))
# Отфильтровываем пациентов, у которых процент значений менее threshold_percent%
filtered_patients <- result_df_sort[result_df_sort$percentage < threshold_percent, ]
# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_patients,
file = "data/originals/percentage_by_patient.xlsx")
# Перезапись data_wide с удалением строк с процентом NA/0 более threshold_percent
data_wide <- data_wide %>%
slice (filtered_patients$patient_id) #при threshold_percent = 95%, изменения data_wide не происходит, так как нет пациентов с процентом NA/0 более 95%
rm (percentage_non_zero_non_na, result_df_sort)
# Deleting columns and rows with a NA/0-percentage more than threshold_percent from `data_wide`
data_wide <- data_wide %>%
select (patient_ID, any_of (colnames(combined_info)), everything()) %>%
arrange(patient_ID)
G_only_one <- data_wide %>%
select (research_ID, ends_with("_G")) %>%
add_column(n = 1) %>% #for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2),
function(x) x/n))
G_only_one %>% as_tibble() %>% flextable()
research_ID | n | CM1G08_G | Cladosporium_G | Lentimonas_G | Micromonospora_G | Pseudosphingobacterium_G | Schizothrix LEGE 07164_G | Talaromyces_G | Rs-D38 termite group_G | Kabatiella_G | Anaerolineaceae UCG-001_G | Iamia_G | Thiohalocapsa_G | Ellin517_G |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 46 | 0.000000000 | 0.000000000 | 0.05001457 | 0.03314097 | 0.01653521 | 0.01087879 | 0.00000000 | 0.000000000 | 0.00000000 | 0.06983517 | 0.03046327 | 0.2743871 | 0.000000 |
2 | 36 | 0.000000000 | 0.008105222 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.01425858 | 0.000000000 | 0.02177045 | 0.00000000 | 0.00000000 | 0.0000000 | 0.000000 |
3 | 70 | 0.000000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.004717417 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.000000 |
4 | 87 | 0.005504621 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.000000 |
5 | 25 | 0.000000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.000000 |
6 | 22 | 0.000000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.000000 |
7 | 95 | 0.000000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 | 1.594206 |
#Determining the percentage of people in whom these taxa have not been detected
G_taxon <- 'Ellin517_G'
res_ID <- 7
#data_wide %>%
#select (research_ID, G_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1,
#function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = G_taxon)
# CM1G08_G - 73,6% нулей
# Cladosporium_G - 94,0% нулей
# Lentimonas_G - 94,0% нулей
# Micromonospora_G - 93,2% нулей
# Pseudosphingobacterium_G - 93,2% нулей
# Schizothrix LEGE 07164_G - 92,7% нулей
# Talaromyces_G - 92,7% нулей
# Rs-D38 termite group_G - 91,9% нулей
# Kabatiella_G - 90,6% нулей
# Anaerolineaceae UCG-001_G - 89,5% нулей
# Iamia_G - 88,2% нулей
# Thiohalocapsa_G - 87,9% нулей
# Ellin517_G - 75,1% нулей
data_wide %>%
select (research_ID, ends_with("_F")) %>%
add_column(n = 1) %>% # for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
as_tibble() %>% flextable()
research_ID | n | Didymellaceae_F | Cladosporiaceae_F | Trichocomaceae_F | type III_F | 09D2Z48_F | Aureobasidiaceae_F | Iamiaceae_F |
|---|---|---|---|---|---|---|---|---|
1 | 46 | 0.00000000 | 0.000000000 | 0.00000000 | 0.01131476 | 0.01196557 | 0.00000000 | 0.03046327 |
2 | 36 | 0.02128992 | 0.008105222 | 0.01425858 | 0.00000000 | 0.00000000 | 0.02177045 | 0.00000000 |
3 | 70 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 |
4 | 87 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 |
5 | 25 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 |
6 | 22 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 |
7 | 95 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.00000000 |
#Determining the percentage of people in whom these taxa have not been detected
#F_taxon <- 'Iamiaceae_F'
#res_ID <- 1
#data_wide %>%
#select (research_ID, F_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1,
#function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = F_taxon)
# Didymellaceae_F - 44,4% нулей
# Cladosporiaceae_F - 36,1% нулей
# Trichocomaceae_F - 22,2% нулей
# type III_F - 30,4% нулей
# 09D2Z48_F - 23,9% нулей
# Aureobasidiaceae_F - 0% нулей
# Iamiaceae_F - 2,2% нулей
data_wide %>%
select (research_ID, ends_with("_O")) %>%
add_column(n = 1) %>% #for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
as_tibble() %>% flextable()
research_ID | n | Hypocreales_O | Pleosporales_O | Candidatus Abawacabacteria_O | Candidatus Peregrinibacteria_O | Capnodiales_O | Candidatus Terrybacteria_O | Eurotiales_O | Dothideales_O | eub62A3_O | LD1-PA32_O |
|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 46 | 0.00000000 | 0.00000000 | 0.008918278 | 0.007617552 | 0.00000000 | 0.007831183 | 0.00000000 | 0.00000000 | 0.0130492 | 0.05113126 |
2 | 36 | 0.06219381 | 0.03669163 | 0.000000000 | 0.000000000 | 0.01202125 | 0.000000000 | 0.01425858 | 0.02680276 | 0.0000000 | 0.00000000 |
3 | 70 | 0.00000000 | 0.00000000 | 0.000000000 | 0.000000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.00000000 |
4 | 87 | 0.00000000 | 0.00000000 | 0.000000000 | 0.000000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.00000000 |
5 | 25 | 0.00000000 | 0.00000000 | 0.000000000 | 0.000000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.00000000 |
6 | 22 | 0.00000000 | 0.00000000 | 0.000000000 | 0.000000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.00000000 |
7 | 95 | 0.00000000 | 0.00000000 | 0.000000000 | 0.000000000 | 0.00000000 | 0.000000000 | 0.00000000 | 0.00000000 | 0.0000000 | 0.00000000 |
#Determining the percentage of people in whom these taxa have not been detected
#O_taxon <- 'Candidatus Abawacabacteria_O'
#res_ID <- 1
#data_wide %>%
#select (research_ID, O_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1, function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = O_taxon)
# Hypocreales_O - 41,7% нулей
# Pleosporales_O - 41,7% нулей
# Candidatus Abawacabacteria_O - 52,2% нулей
# Candidatus Peregrinibacteria_O - 52,2% нулей
# Capnodiales_O - 33,3% нулей
# Candidatus Terrybacteria_O - 39,1% нулей
# Eurotiales_O - 39,1% нулей
# Dothideales_O - 0% нулей
# eub62A3_O - 15,2% нулей
# LD1-PA32_O - 2,2% нулей
data_wide %>%
select (research_ID, ends_with("_C")) %>%
add_column(n = 1) %>% #for further counting the subjects number in each study
group_by(research_ID) %>%
summarise_each (sum) %>%
select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>% #selecting "single study taxa"
mutate (across(-c(1:2), function(x) x/n)) %>%
as_tibble() %>% flextable()
research_ID | n | WWE3_C | Eurotiomycetes_C | Dothideomycetes_C |
|---|---|---|---|---|
1 | 46 | 0.00000000 | 0.00000000 | 0.0000000 |
2 | 36 | 0.00000000 | 0.02293748 | 0.1015485 |
3 | 70 | 0.00000000 | 0.00000000 | 0.0000000 |
4 | 87 | 0.01350327 | 0.00000000 | 0.0000000 |
5 | 25 | 0.00000000 | 0.00000000 | 0.0000000 |
6 | 22 | 0.00000000 | 0.00000000 | 0.0000000 |
7 | 95 | 0.00000000 | 0.00000000 | 0.0000000 |
#Determining the percentage of people in whom these taxa have not been detected
#С_taxon <- 'Dothideomycetes_C'
#res_ID <- 2
#data_wide %>%
#select (research_ID, С_taxon) %>%
#filter (research_ID == res_ID) %>%
#summarise(across (-1, function (x) sum (x==0)/nrow (.) * 100)) %>%
#rename ("Taxon_zero_percentage, %" = С_taxon)
# WWE3_C - 74,4% нулей
# Eurotiomycetes_C - 19,4% нулей
# Dothideomycetes_C - 0% нулей
“Single study P-taxa” have not been detected
data_wide <- data_wide %>%
select (!colnames (G_only_one) [- c (1,2)]) #deleting of single study G-taxa (in this single study the taxa were present in no more than 25% of subjects)
data_wide_not_batched <- data_wide
write_rds(data_wide,
file = "data/originals/data_wide_not_batched.rds")
clusters_number <- 7 #number of clusters
library(cluster)
library(factoextra)
G_scaled <- data_wide %>%
mutate(patient_ID = paste0("patient_", patient_ID)) %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
scale ()
agnes <- G_scaled %>%
dist (method = "euclidean") %>%
as.matrix() %>%
agnes( #Agglomerative clustering
diss = TRUE, #dissimilarity matrix
method = "ward")
fviz_dend(agnes,
k = clusters_number,
rect = TRUE,
k_colors = "jco",
show_labels = TRUE,
label_cols = ifelse (data_wide[agnes$order,]$Health_state == "Health", "green", "red"),
#labels colors = Health_state
cex = 0.2,
main = "Cluster dengrogram based on G-taxa abundance before batch effect correction",
ylab = "")
data_wide %>%
add_column("Cluster" = cutree(agnes, k = clusters_number)) %>%
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p()
| Characteristic | 1, N = 461 | 2, N = 311 | 3, N = 1891 | 4, N = 231 | 5, N = 651 | 6, N = 21 | 7, N = 251 | p-value2 |
|---|---|---|---|---|---|---|---|---|
| research_ID | ||||||||
| 1 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2 | 0 (0%) | 31 (100%) | 4 (2%) | 1 (4%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 3 | 0 (0%) | 0 (0%) | 2 (1%) | 3 (13%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| 4 | 0 (0%) | 0 (0%) | 66 (35%) | 19 (83%) | 0 (0%) | 2 (100%) | 0 (0%) | |
| 5 | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 25 (100%) | |
| 6 | 0 (0%) | 0 (0%) | 22 (12%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 7 | 0 (0%) | 0 (0%) | 95 (50%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_region | ||||||||
| V3-V4 | 0 (0%) | 31 (100%) | 6 (3%) | 4 (17%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| V4 | 0 (0%) | 0 (0%) | 183 (97%) | 19 (83%) | 0 (0%) | 2 (100%) | 25 (100%) | |
| V5-V6 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_date | ||||||||
| 2015 | 0 (0%) | 0 (0%) | 95 (50%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2017 | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 25 (100%) | |
| 2018 | 46 (100%) | 31 (100%) | 11 (6%) | 7 (30%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| 2020 | 0 (0%) | 0 (0%) | 36 (19%) | 2 (9%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| 2021 | 0 (0%) | 0 (0%) | 21 (11%) | 11 (48%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2022 | 0 (0%) | 0 (0%) | 23 (12%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2023 | 0 (0%) | 0 (0%) | 3 (2%) | 3 (13%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| Health_state | ||||||||
| Disease | 0 (0%) | 0 (0%) | 134 (71%) | 10 (43%) | 0 (0%) | 1 (50%) | 25 (100%) | |
| Health | 46 (100%) | 31 (100%) | 55 (29%) | 13 (57%) | 65 (100%) | 1 (50%) | 0 (0%) | |
| Age | NA (NA, NA) | NA (NA, NA) | 32 (27, 42) | 49 (43, 52) | NA (NA, NA) | 32 (32, 33) | 39 (28, 49) | 0.002 |
| Unknown | 46 | 31 | 101 | 4 | 65 | 0 | 0 | |
| Age_range | ||||||||
| > 43 | 0 (0%) | 0 (0%) | 21 (22%) | 13 (57%) | 0 (0%) | 0 (0%) | 10 (40%) | |
| 16-43 | 46 (100%) | 31 (100%) | 73 (78%) | 10 (43%) | 65 (100%) | 2 (100%) | 15 (60%) | |
| Unknown | 0 | 0 | 95 | 0 | 0 | 0 | 0 | |
| Sex | <0.001 | |||||||
| female | 0 (NA%) | 0 (0%) | 20 (42%) | 5 (24%) | 38 (58%) | 0 (0%) | 16 (64%) | |
| male | 0 (NA%) | 31 (100%) | 28 (58%) | 16 (76%) | 27 (42%) | 1 (100%) | 9 (36%) | |
| Unknown | 46 | 0 | 141 | 2 | 0 | 1 | 0 | |
| Country | ||||||||
| Australia | 0 (0%) | 0 (0%) | 2 (1%) | 4 (17%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Austria | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9%) | 0 (0%) | 1 (50%) | 25 (100%) | |
| Belgium | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Germany | 0 (0%) | 0 (0%) | 45 (24%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Greece | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 0 (0%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Ireland | 0 (0%) | 0 (0%) | 0 (0%) | 1 (4%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Israel | 0 (0%) | 0 (0%) | 23 (12%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Italy | 0 (0%) | 31 (100%) | 4 (2%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Norway | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Poland | 0 (0%) | 0 (0%) | 2 (1%) | 3 (13%) | 65 (100%) | 0 (0%) | 0 (0%) | |
| Spain | 0 (0%) | 0 (0%) | 95 (50%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| UK | 0 (0%) | 0 (0%) | 8 (4%) | 3 (13%) | 0 (0%) | 1 (50%) | 0 (0%) | |
| USA | 0 (0%) | 0 (0%) | 8 (4%) | 6 (26%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Race | 0.016 | |||||||
| African American | 0 (0%) | 0 (NA%) | 1 (2%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (NA%) | 1 (2%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Caucasian | 46 (100%) | 0 (NA%) | 49 (78%) | 19 (100%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| Hispanic | 0 (0%) | 0 (NA%) | 1 (2%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Other | 0 (0%) | 0 (NA%) | 11 (17%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 0 | 31 | 126 | 4 | 65 | 0 | 25 | |
| Smoking | 0 (0%) | 0 (NA%) | 14 (21%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (NA%) | <0.001 |
| Unknown | 0 | 31 | 121 | 1 | 0 | 0 | 25 | |
| Alcohol | 0.8 | |||||||
| Never | 0 (NA%) | 0 (NA%) | 8 (12%) | 4 (21%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Occasionally (1-2 times/week) | 0 (NA%) | 0 (NA%) | 20 (30%) | 4 (21%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Rarely (a few times/month) | 0 (NA%) | 0 (NA%) | 26 (39%) | 8 (42%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 0 (NA%) | 12 (18%) | 3 (16%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 31 | 123 | 4 | 65 | 0 | 25 | |
| Antibiotics_usage | <0.001 | |||||||
| 1-6 months | 46 (100%) | 0 (0%) | 1 (5%) | 1 (8%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| 12 months/Not use | 0 (0%) | 31 (100%) | 21 (95%) | 11 (92%) | 65 (100%) | 1 (100%) | 0 (NA%) | |
| Unknown | 0 | 0 | 167 | 11 | 0 | 1 | 25 | |
| Physical_activity | 0 (0%) | 0 (NA%) | 14 (21%) | 2 (9%) | 0 (0%) | 0 (0%) | 0 (NA%) | <0.001 |
| Unknown | 0 | 31 | 121 | 1 | 0 | 0 | 25 | |
| Travel_period | 0.019 | |||||||
| 3 months | 0 (NA%) | 0 (NA%) | 12 (18%) | 6 (33%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| 6 months | 0 (NA%) | 0 (NA%) | 10 (15%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Month | 0 (NA%) | 0 (NA%) | 7 (11%) | 4 (22%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| Year | 0 (NA%) | 0 (NA%) | 37 (56%) | 8 (44%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 31 | 123 | 5 | 65 | 0 | 25 | |
| Education_level | 0.006 | |||||||
| Bachelor's level | 0 (NA%) | 0 (NA%) | 26 (41%) | 5 (28%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Master's level | 0 (NA%) | 0 (NA%) | 15 (23%) | 11 (61%) | 0 (NA%) | 2 (100%) | 0 (NA%) | |
| School Level | 0 (NA%) | 0 (NA%) | 23 (36%) | 2 (11%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Unknown | 46 | 31 | 125 | 5 | 65 | 0 | 25 | |
| Cosmetics | 0.2 | |||||||
| Never | 0 (NA%) | 0 (NA%) | 30 (46%) | 13 (68%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| Occasionally (a few-8 times/month) | 0 (NA%) | 0 (NA%) | 16 (25%) | 1 (5%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 0 (NA%) | 19 (29%) | 5 (26%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| Unknown | 46 | 31 | 124 | 4 | 65 | 0 | 25 | |
| Sleep_duration | 0.2 | |||||||
| > 8 hours | 0 (NA%) | 0 (NA%) | 6 (9%) | 3 (16%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| 4-6 hours | 0 (NA%) | 0 (NA%) | 5 (8%) | 4 (21%) | 0 (NA%) | 0 (0%) | 0 (NA%) | |
| 6-7 hours | 0 (NA%) | 0 (NA%) | 22 (33%) | 8 (42%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| 7-8 hours | 0 (NA%) | 0 (NA%) | 33 (50%) | 4 (21%) | 0 (NA%) | 1 (50%) | 0 (NA%) | |
| Unknown | 46 | 31 | 123 | 4 | 65 | 0 | 25 | |
| BMI_category | 0.10 | |||||||
| normal/overweight | 46 (100%) | 31 (100%) | 133 (97%) | 11 (85%) | 65 (100%) | 1 (100%) | 0 (NA%) | |
| obese | 0 (0%) | 0 (0%) | 3 (2%) | 2 (15%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| underweight | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | |
| Unknown | 0 | 0 | 52 | 10 | 0 | 1 | 25 | |
| 1 n (%); Median (IQR) | ||||||||
| 2 Kruskal-Wallis rank sum test; Fisher’s exact test | ||||||||
agnes2 <- data_wide %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
vegdist(method = "bray") %>% #distance matrix for Bray-Curtis dissimilarity:
#the Bray–Curtis dissimilarity is bounded between 0 and 1, where 0 means the two sites have the same composition , and 1 means the two sites do not share any species
as.matrix() %>%
agnes( #Agglomerative clustering
diss = TRUE, #dissimilarity matrix
method = "ward")
fviz_dend(agnes2,
cex = 0.2, k = clusters_number,
rect = TRUE,
k_colors = "jco",
color_labels_by_k = FALSE,
label_cols = ifelse (data_wide[agnes2$order,]$Health_state == "Health", "green", "red"),
#labels colors = Health_state
main = "Cluster dengrogram based on G-taxa abundance before batch effect correction",
ylab = "")
fisher.test.simulate.p.values <- function(data, variable, by, ...) {
result <- list()
test_results <- stats::fisher.test(data[[variable]], data[[by]], simulate.p.value = TRUE)
result$p <- test_results$p.value
result$test <- test_results$method
result
}
data_wide %>%
add_column("Cluster" = cutree(agnes2, k = clusters_number)) %>%
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p(test = list(all_categorical() ~ "fisher.test.simulate.p.values"))
| Characteristic | 1, N = 461 | 2, N = 701 | 3, N = 1211 | 4, N = 411 | 5, N = 461 | 6, N = 211 | 7, N = 361 | p-value2 |
|---|---|---|---|---|---|---|---|---|
| research_ID | <0.001 | |||||||
| 1 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2 | 0 (0%) | 19 (27%) | 5 (4%) | 8 (20%) | 3 (7%) | 1 (5%) | 0 (0%) | |
| 3 | 0 (0%) | 25 (36%) | 26 (21%) | 8 (20%) | 9 (20%) | 0 (0%) | 2 (6%) | |
| 4 | 0 (0%) | 6 (9%) | 36 (30%) | 0 (0%) | 9 (20%) | 20 (95%) | 16 (44%) | |
| 5 | 0 (0%) | 2 (3%) | 10 (8%) | 0 (0%) | 0 (0%) | 0 (0%) | 13 (36%) | |
| 6 | 0 (0%) | 4 (6%) | 9 (7%) | 3 (7%) | 4 (9%) | 0 (0%) | 2 (6%) | |
| 7 | 0 (0%) | 14 (20%) | 35 (29%) | 22 (54%) | 21 (46%) | 0 (0%) | 3 (8%) | |
| Seq_region | <0.001 | |||||||
| V3-V4 | 0 (0%) | 44 (63%) | 31 (26%) | 16 (39%) | 12 (26%) | 1 (5%) | 2 (6%) | |
| V4 | 0 (0%) | 26 (37%) | 90 (74%) | 25 (61%) | 34 (74%) | 20 (95%) | 34 (94%) | |
| V5-V6 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_date | <0.001 | |||||||
| 2015 | 0 (0%) | 14 (20%) | 35 (29%) | 22 (54%) | 21 (46%) | 0 (0%) | 3 (8%) | |
| 2017 | 0 (0%) | 2 (3%) | 10 (8%) | 0 (0%) | 0 (0%) | 0 (0%) | 13 (36%) | |
| 2018 | 46 (100%) | 21 (30%) | 8 (7%) | 8 (20%) | 5 (11%) | 7 (33%) | 1 (3%) | |
| 2020 | 0 (0%) | 2 (3%) | 20 (17%) | 0 (0%) | 5 (11%) | 2 (10%) | 10 (28%) | |
| 2021 | 0 (0%) | 2 (3%) | 11 (9%) | 0 (0%) | 2 (4%) | 12 (57%) | 5 (14%) | |
| 2022 | 0 (0%) | 4 (6%) | 10 (8%) | 3 (7%) | 4 (9%) | 0 (0%) | 2 (6%) | |
| 2023 | 0 (0%) | 25 (36%) | 27 (22%) | 8 (20%) | 9 (20%) | 0 (0%) | 2 (6%) | |
| Health_state | <0.001 | |||||||
| Disease | 0 (0%) | 23 (33%) | 63 (52%) | 25 (61%) | 29 (63%) | 10 (48%) | 20 (56%) | |
| Health | 46 (100%) | 47 (67%) | 58 (48%) | 16 (39%) | 17 (37%) | 11 (52%) | 16 (44%) | |
| Age | NA (NA, NA) | 36 (29, 47) | 33 (27, 48) | 35 (30, 41) | 32 (29, 38) | 48 (42, 56) | 31 (26, 41) | 0.010 |
| Unknown | 46 | 58 | 66 | 38 | 33 | 1 | 5 | |
| Age_range | <0.001 | |||||||
| > 43 | 0 (0%) | 5 (9%) | 15 (17%) | 1 (5%) | 3 (12%) | 13 (62%) | 7 (21%) | |
| 16-43 | 46 (100%) | 51 (91%) | 71 (83%) | 18 (95%) | 22 (88%) | 8 (38%) | 26 (79%) | |
| Unknown | 0 | 14 | 35 | 22 | 21 | 0 | 3 | |
| Sex | 0.027 | |||||||
| female | 0 (NA%) | 18 (34%) | 36 (59%) | 4 (21%) | 7 (37%) | 6 (32%) | 8 (40%) | |
| male | 0 (NA%) | 35 (66%) | 25 (41%) | 15 (79%) | 12 (63%) | 13 (68%) | 12 (60%) | |
| Unknown | 46 | 17 | 60 | 22 | 27 | 2 | 16 | |
| Country | <0.001 | |||||||
| Australia | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 4 (19%) | 1 (3%) | |
| Austria | 0 (0%) | 2 (3%) | 10 (8%) | 0 (0%) | 0 (0%) | 2 (10%) | 14 (39%) | |
| Belgium | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Germany | 0 (0%) | 3 (4%) | 25 (21%) | 0 (0%) | 5 (11%) | 0 (0%) | 12 (33%) | |
| Greece | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 2 (10%) | 0 (0%) | |
| Ireland | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | |
| Israel | 0 (0%) | 4 (6%) | 10 (8%) | 3 (7%) | 4 (9%) | 0 (0%) | 2 (6%) | |
| Italy | 0 (0%) | 19 (27%) | 5 (4%) | 8 (20%) | 3 (7%) | 2 (10%) | 0 (0%) | |
| Norway | 0 (0%) | 0 (0%) | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Poland | 0 (0%) | 25 (36%) | 26 (21%) | 8 (20%) | 9 (20%) | 0 (0%) | 2 (6%) | |
| Spain | 0 (0%) | 14 (20%) | 35 (29%) | 22 (54%) | 21 (46%) | 0 (0%) | 3 (8%) | |
| UK | 0 (0%) | 2 (3%) | 1 (1%) | 0 (0%) | 4 (9%) | 4 (19%) | 1 (3%) | |
| USA | 0 (0%) | 1 (1%) | 7 (6%) | 0 (0%) | 0 (0%) | 5 (24%) | 1 (3%) | |
| Race | 0.001 | |||||||
| African American | 0 (0%) | 0 (0%) | 1 (3%) | 0 (NA%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | 1 (11%) | 0 (0%) | 0 (0%) | |
| Caucasian | 46 (100%) | 6 (100%) | 28 (80%) | 0 (NA%) | 7 (78%) | 20 (100%) | 9 (64%) | |
| Hispanic | 0 (0%) | 0 (0%) | 0 (0%) | 0 (NA%) | 0 (0%) | 0 (0%) | 1 (7%) | |
| Other | 0 (0%) | 0 (0%) | 6 (17%) | 0 (NA%) | 1 (11%) | 0 (0%) | 4 (29%) | |
| Unknown | 0 | 64 | 86 | 41 | 37 | 1 | 22 | |
| Smoking | 0 (0%) | 2 (6%) | 7 (11%) | 0 (0%) | 1 (6%) | 3 (15%) | 3 (17%) | 0.090 |
| Unknown | 0 | 39 | 59 | 33 | 28 | 1 | 18 | |
| Alcohol | 0.7 | |||||||
| Never | 0 (NA%) | 0 (0%) | 5 (14%) | 0 (NA%) | 1 (11%) | 5 (25%) | 1 (6%) | |
| Occasionally (1-2 times/week) | 0 (NA%) | 1 (17%) | 12 (33%) | 0 (NA%) | 1 (11%) | 4 (20%) | 6 (38%) | |
| Rarely (a few times/month) | 0 (NA%) | 4 (67%) | 12 (33%) | 0 (NA%) | 4 (44%) | 8 (40%) | 8 (50%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 1 (17%) | 7 (19%) | 0 (NA%) | 3 (33%) | 3 (15%) | 1 (6%) | |
| Unknown | 46 | 64 | 85 | 41 | 37 | 1 | 20 | |
| Antibiotics_usage | <0.001 | |||||||
| 1-6 months | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (11%) | 1 (25%) | |
| 12 months/Not use | 0 (0%) | 47 (100%) | 39 (100%) | 16 (100%) | 16 (100%) | 8 (89%) | 3 (75%) | |
| Unknown | 0 | 23 | 82 | 25 | 30 | 12 | 32 | |
| Physical_activity | 0 (0%) | 2 (6%) | 7 (11%) | 0 (0%) | 1 (6%) | 3 (15%) | 3 (17%) | 0.069 |
| Unknown | 0 | 39 | 59 | 33 | 28 | 1 | 18 | |
| Travel_period | 0.056 | |||||||
| 3 months | 0 (NA%) | 0 (0%) | 10 (28%) | 0 (NA%) | 1 (11%) | 5 (26%) | 2 (13%) | |
| 6 months | 0 (NA%) | 0 (0%) | 8 (22%) | 0 (NA%) | 1 (11%) | 0 (0%) | 1 (6%) | |
| Month | 0 (NA%) | 0 (0%) | 2 (6%) | 0 (NA%) | 1 (11%) | 4 (21%) | 6 (38%) | |
| Year | 0 (NA%) | 6 (100%) | 16 (44%) | 0 (NA%) | 6 (67%) | 10 (53%) | 7 (44%) | |
| Unknown | 46 | 64 | 85 | 41 | 37 | 2 | 20 | |
| Education_level | 0.2 | |||||||
| Bachelor's level | 0 (NA%) | 1 (20%) | 11 (31%) | 0 (NA%) | 3 (33%) | 6 (32%) | 10 (63%) | |
| Master's level | 0 (NA%) | 1 (20%) | 10 (29%) | 0 (NA%) | 4 (44%) | 10 (53%) | 3 (19%) | |
| School Level | 0 (NA%) | 3 (60%) | 14 (40%) | 0 (NA%) | 2 (22%) | 3 (16%) | 3 (19%) | |
| Unknown | 46 | 65 | 86 | 41 | 37 | 2 | 20 | |
| Cosmetics | 0.7 | |||||||
| Never | 0 (NA%) | 3 (50%) | 18 (50%) | 0 (NA%) | 4 (50%) | 12 (60%) | 7 (44%) | |
| Occasionally (a few-8 times/month) | 0 (NA%) | 2 (33%) | 6 (17%) | 0 (NA%) | 3 (38%) | 2 (10%) | 4 (25%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 1 (17%) | 12 (33%) | 0 (NA%) | 1 (13%) | 6 (30%) | 5 (31%) | |
| Unknown | 46 | 64 | 85 | 41 | 38 | 1 | 20 | |
| Sleep_duration | 0.2 | |||||||
| > 8 hours | 0 (NA%) | 2 (33%) | 2 (6%) | 0 (NA%) | 1 (11%) | 3 (15%) | 1 (6%) | |
| 4-6 hours | 0 (NA%) | 0 (0%) | 2 (6%) | 0 (NA%) | 1 (11%) | 4 (20%) | 2 (13%) | |
| 6-7 hours | 0 (NA%) | 2 (33%) | 12 (33%) | 0 (NA%) | 1 (11%) | 9 (45%) | 7 (44%) | |
| 7-8 hours | 0 (NA%) | 2 (33%) | 20 (56%) | 0 (NA%) | 6 (67%) | 4 (20%) | 6 (38%) | |
| Unknown | 46 | 64 | 85 | 41 | 37 | 1 | 20 | |
| BMI_category | 0.033 | |||||||
| normal/overweight | 46 (100%) | 64 (98%) | 80 (98%) | 41 (100%) | 39 (98%) | 8 (80%) | 9 (100%) | |
| obese | 0 (0%) | 1 (2%) | 2 (2%) | 0 (0%) | 0 (0%) | 2 (20%) | 0 (0%) | |
| underweight | 0 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | 1 (3%) | 0 (0%) | 0 (0%) | |
| Unknown | 0 | 5 | 39 | 0 | 6 | 11 | 27 | |
| 1 n (%); Median (IQR) | ||||||||
| 2 Fisher’s Exact Test for Count Data with simulated p-value (based on 2000 replicates); Kruskal-Wallis rank sum test | ||||||||
kmeans <- G_scaled %>%
kmeans(centers = clusters_number,
iter.max = 20, nstart = 35)
fviz_cluster(kmeans,data = G_scaled,
ellipse = FALSE,
show.clust.cent = FALSE,
geom = "point",
main = "kMeans clustering based on G-taxa abundance before batch effect correction")
the MBECS package was used, a step-by-step actions description is available at the link https://github.com/rmolbrich/MBECS
Preliminary report is available in /data/originals/.
#Run corrections
mbec.obj <- mbecRunCorrections(
mbec.obj,
model.vars=c('research_ID', 'Health_state'),
#model.vars=c (batch effect, presumed biological effect of interest)
method = "bat", #batch effect correcting algorithm:
#Batch Mean Centering (bmc) /ComBat (bat)/Remove Batch Effect (rbe)
type = "otu") #which abundance matrix to use
## Found 459 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
#Post report
#mbecReportPost(
#input.obj=mbec.obj,
#model.vars=c('research_ID', 'Health_state'), #model.vars=c (batch effect, presumed biological effect of interest)
#type="otu", # which abundance matrix to use for evaluation: clr/otu/tss
#file.name = "Batch_(research_id)_Correction_Report_otu_bat",
#file.dir = "data/originals/",
#return.data = FALSE)
library (datawizard)
#Retrieve corrrected data
ps.cor.bat <- mbecGetPhyloseq(mbec.obj,
type="cor",
label="bat") #which type of data to add (correction)
combined_bacteria_batched <- ps.cor.bat@otu_table %>%
as.data.frame() %>%
data_rotate (rownames = "patient_ID") %>%
mutate (patient_ID = sub ("S", "", patient_ID),
patient_ID = as.numeric(patient_ID))
data_wide_batched <- combined_info %>%
select (where (function(x)
sum (!is.na(x))/ nrow(.) * 100 >= 5)) %>% #selecting of variables with less than 95% NA
left_join(combined_bacteria_batched)
rm (combined_bacteria_batched)
####Principal Component Analysis
plot <- mbecPCA(input.obj=mbec.obj,
model.vars=c('research_ID', 'Health_state'),
type="otu", pca.axes=c(1,2), return.data = FALSE)
ggsave("data/pictures/PCA_befor_batching.jpeg", plot = plot)
plot <- mbecPCA(input.obj=mbec.obj,
model.vars=c('research_ID', 'Health_state'),
type="cor", label = "bat",
pca.axes=c(1,2), return.data = FALSE)
ggsave("data/pictures/PCA_after_batching.jpeg", plot = plot)
####Heatmap
mbecHeat(input.obj=mbec.obj, method = "TOP", n = 10,
model.vars=c('research_ID', 'Health_state'),
center = TRUE, scale = TRUE,
type="otu",
return.data = FALSE)
mbecHeat(input.obj=mbec.obj, method = "TOP", n = 10,
model.vars=c('research_ID', 'Health_state'),
center = TRUE, scale = TRUE,
type="cor", label = "bat",
return.data = FALSE)
clusters_number <- 4 #количество кластеров
library(cluster)
library(factoextra)
G_scaled <- data_wide_batched %>%
mutate(patient_ID = paste0("patient_", patient_ID)) %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>% #выбираем только G-таксоны
scale () #нормируем данные
agnes <- G_scaled %>%
dist (method = "euclidean") %>% #cоздаём матрицу дистанций
as.matrix() %>%
agnes( #выбираем Agglomerative clustering (иерархическая кластеризация снизу вверх)
diss = TRUE, #на вход подана dissimilarity matrix
method = "ward")
fviz_dend(agnes,
k = clusters_number,
rect = TRUE,
k_colors = "jco",
show_labels = TRUE,
label_cols = ifelse (data_wide_batched[agnes$order,]$Health_state == "Health", "green", "red"),#цвет лейблов определяется по Health_state
cex = 0.2, #размер шрифта лейблов
main = "Кластеризация субъектов по содержанию G-таксонов после batch-коррекции",
ylab = "Высота")
data_wide %>%
add_column("Cluster" = cutree(agnes, k = clusters_number)) %>% # Создаём новый столбец - вектор принадлежности к кластерам
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p()
| Characteristic | 1, N = 461 | 2, N = 2881 | 3, N = 221 | 4, N = 251 | p-value2 |
|---|---|---|---|---|---|
| research_ID | |||||
| 1 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| 2 | 0 (0%) | 35 (12%) | 1 (5%) | 0 (0%) | |
| 3 | 0 (0%) | 66 (23%) | 4 (18%) | 0 (0%) | |
| 4 | 0 (0%) | 69 (24%) | 16 (73%) | 2 (8%) | |
| 5 | 0 (0%) | 2 (1%) | 0 (0%) | 23 (92%) | |
| 6 | 0 (0%) | 22 (8%) | 0 (0%) | 0 (0%) | |
| 7 | 0 (0%) | 94 (33%) | 1 (5%) | 0 (0%) | |
| Seq_region | |||||
| V3-V4 | 0 (0%) | 101 (35%) | 5 (23%) | 0 (0%) | |
| V4 | 0 (0%) | 187 (65%) | 17 (77%) | 25 (100%) | |
| V5-V6 | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Seq_date | |||||
| 2015 | 0 (0%) | 94 (33%) | 1 (5%) | 0 (0%) | |
| 2017 | 0 (0%) | 2 (1%) | 0 (0%) | 23 (92%) | |
| 2018 | 46 (100%) | 43 (15%) | 6 (27%) | 1 (4%) | |
| 2020 | 0 (0%) | 36 (13%) | 2 (9%) | 1 (4%) | |
| 2021 | 0 (0%) | 23 (8%) | 9 (41%) | 0 (0%) | |
| 2022 | 0 (0%) | 23 (8%) | 0 (0%) | 0 (0%) | |
| 2023 | 0 (0%) | 67 (23%) | 4 (18%) | 0 (0%) | |
| Health_state | <0.001 | ||||
| Disease | 0 (0%) | 137 (48%) | 9 (41%) | 24 (96%) | |
| Health | 46 (100%) | 151 (52%) | 13 (59%) | 1 (4%) | |
| Age | NA (NA, NA) | 32 (27, 46) | 48 (42, 51) | 34 (28, 48) | 0.010 |
| Unknown | 46 | 195 | 6 | 0 | |
| Age_range | <0.001 | ||||
| > 43 | 0 (0%) | 24 (12%) | 11 (52%) | 9 (36%) | |
| 16-43 | 46 (100%) | 170 (88%) | 10 (48%) | 16 (64%) | |
| Unknown | 0 | 94 | 1 | 0 | |
| Sex | 0.11 | ||||
| female | 0 (NA%) | 60 (41%) | 5 (26%) | 14 (58%) | |
| male | 0 (NA%) | 88 (59%) | 14 (74%) | 10 (42%) | |
| Unknown | 46 | 140 | 3 | 1 | |
| Country | |||||
| Australia | 0 (0%) | 3 (1%) | 3 (14%) | 0 (0%) | |
| Austria | 0 (0%) | 2 (1%) | 2 (9%) | 24 (96%) | |
| Belgium | 46 (100%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Germany | 0 (0%) | 45 (16%) | 0 (0%) | 0 (0%) | |
| Greece | 0 (0%) | 1 (0%) | 0 (0%) | 0 (0%) | |
| Hungary | 0 (0%) | 0 (0%) | 2 (9%) | 0 (0%) | |
| Ireland | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | |
| Israel | 0 (0%) | 23 (8%) | 0 (0%) | 0 (0%) | |
| Italy | 0 (0%) | 35 (12%) | 2 (9%) | 0 (0%) | |
| Norway | 0 (0%) | 1 (0%) | 0 (0%) | 0 (0%) | |
| Poland | 0 (0%) | 66 (23%) | 4 (18%) | 0 (0%) | |
| Spain | 0 (0%) | 94 (33%) | 1 (5%) | 0 (0%) | |
| UK | 0 (0%) | 8 (3%) | 3 (14%) | 1 (4%) | |
| USA | 0 (0%) | 10 (3%) | 4 (18%) | 0 (0%) | |
| Race | 0.029 | ||||
| African American | 0 (0%) | 1 (2%) | 0 (0%) | 0 (0%) | |
| Asian or Pacific Islander | 0 (0%) | 1 (2%) | 0 (0%) | 0 (0%) | |
| Caucasian | 46 (100%) | 52 (79%) | 16 (100%) | 2 (100%) | |
| Hispanic | 0 (0%) | 1 (2%) | 0 (0%) | 0 (0%) | |
| Other | 0 (0%) | 11 (17%) | 0 (0%) | 0 (0%) | |
| Unknown | 0 | 222 | 6 | 23 | |
| Smoking | 0 (0%) | 15 (11%) | 1 (5%) | 0 (0%) | 0.060 |
| Unknown | 0 | 153 | 2 | 23 | |
| Alcohol | >0.9 | ||||
| Never | 0 (NA%) | 10 (14%) | 2 (13%) | 0 (0%) | |
| Occasionally (1-2 times/week) | 0 (NA%) | 20 (29%) | 4 (25%) | 0 (0%) | |
| Rarely (a few times/month) | 0 (NA%) | 26 (38%) | 8 (50%) | 2 (100%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 13 (19%) | 2 (13%) | 0 (0%) | |
| Unknown | 46 | 219 | 6 | 23 | |
| Antibiotics_usage | <0.001 | ||||
| 1-6 months | 46 (100%) | 1 (1%) | 1 (9%) | 0 (0%) | |
| 12 months/Not use | 0 (0%) | 118 (99%) | 10 (91%) | 1 (100%) | |
| Unknown | 0 | 169 | 11 | 24 | |
| Physical_activity | 0 (0%) | 15 (11%) | 1 (5%) | 0 (0%) | 0.060 |
| Unknown | 0 | 153 | 2 | 23 | |
| Travel_period | 0.016 | ||||
| 3 months | 0 (NA%) | 13 (19%) | 5 (33%) | 0 (0%) | |
| 6 months | 0 (NA%) | 10 (14%) | 0 (0%) | 0 (0%) | |
| Month | 0 (NA%) | 7 (10%) | 4 (27%) | 2 (100%) | |
| Year | 0 (NA%) | 39 (57%) | 6 (40%) | 0 (0%) | |
| Unknown | 46 | 219 | 7 | 23 | |
| Education_level | 0.070 | ||||
| Bachelor's level | 0 (NA%) | 26 (39%) | 5 (33%) | 0 (0%) | |
| Master's level | 0 (NA%) | 18 (27%) | 8 (53%) | 2 (100%) | |
| School Level | 0 (NA%) | 23 (34%) | 2 (13%) | 0 (0%) | |
| Unknown | 46 | 221 | 7 | 23 | |
| Cosmetics | 0.4 | ||||
| Never | 0 (NA%) | 32 (47%) | 11 (69%) | 1 (50%) | |
| Occasionally (a few-8 times/month) | 0 (NA%) | 16 (24%) | 1 (6%) | 0 (0%) | |
| Regularly (3-7 times/week) | 0 (NA%) | 20 (29%) | 4 (25%) | 1 (50%) | |
| Unknown | 46 | 220 | 6 | 23 | |
| Sleep_duration | 0.14 | ||||
| > 8 hours | 0 (NA%) | 7 (10%) | 2 (13%) | 0 (0%) | |
| 4-6 hours | 0 (NA%) | 5 (7%) | 4 (25%) | 0 (0%) | |
| 6-7 hours | 0 (NA%) | 23 (33%) | 7 (44%) | 1 (50%) | |
| 7-8 hours | 0 (NA%) | 34 (49%) | 3 (19%) | 1 (50%) | |
| Unknown | 46 | 219 | 6 | 23 | |
| BMI_category | 0.063 | ||||
| normal/overweight | 46 (100%) | 229 (98%) | 11 (85%) | 1 (100%) | |
| obese | 0 (0%) | 3 (1%) | 2 (15%) | 0 (0%) | |
| underweight | 0 (0%) | 1 (0%) | 0 (0%) | 0 (0%) | |
| Unknown | 0 | 55 | 9 | 24 | |
| 1 n (%); Median (IQR) | |||||
| 2 Pearson’s Chi-squared test; Kruskal-Wallis rank sum test; Fisher’s exact test | |||||
kmeans <- G_scaled %>%
kmeans(centers = clusters_number, # Кол-во кластеров
iter.max = 20, # Максимальное кол-во итераций
nstart = 35) # Кол-во центройдов в начале
fviz_cluster(kmeans,data = G_scaled,
ellipse = FALSE,
show.clust.cent = FALSE,
geom = "point",
main = "Кластеризация субъектов по содержанию G-таксонов до коррекции batch-эффекта")
agnes2 <- data_wide_batched %>%
column_to_rownames("patient_ID") %>%
select (ends_with("_G")) %>%
vegdist(method = "bray") %>% #cоздаём матрицу дистанций, рассчитывая показатель несходства Брея — Кертиса (Bray-Curtis dissimilarity, 0 - полное совпадение пациентов по составу микробиоты, 1 - полное несовпадение
as.matrix() %>%
agnes( #выбираем Agglomerative clustering (иерархическая кластеризация снизу вверх)
diss = TRUE, #на вход подана dissimilarity matrix
method = "ward")
fviz_dend(agnes2,
cex = 0.2, k = clusters_number,
rect = TRUE,
k_colors = "jco",
color_labels_by_k = FALSE,
label_cols = case_when (data_wide_batched[agnes2$order,]$Age_range == "> 43" ~ "red",
data_wide_batched[agnes2$order,]$Age_range == "16-43" ~ "green",
TRUE ~ "white"),
main = "Дендрограмма кластеризации субъектов по количеству G-таксонов",
ylab = "Высота")
data_wide %>%
add_column("Cluster" = cutree(agnes2, k = clusters_number)) %>% # Создаём новый столбец - вектор принадлежности к кластерам
select(any_of(colnames (combined_info)), Cluster, - patient_ID) %>%
tbl_summary(digits = list(all_continuous() ~ c(0, 0),
all_categorical() ~ c(0, 0)),
by = Cluster) %>%
add_p()
| Characteristic | 1, N = 2091 | 2, N = 401 | 3, N = 581 | 4, N = 741 | p-value2 |
|---|---|---|---|---|---|
| research_ID | |||||
| 1 | 25 (12%) | 3 (8%) | 11 (19%) | 7 (9%) | |
| 2 | 20 (10%) | 1 (3%) | 3 (5%) | 12 (16%) | |
| 3 | 43 (21%) | 3 (8%) | 11 (19%) | 13 (18%) | |
| 4 | 47 (22%) | 16 (40%) | 9 (16%) | 15 (20%) | |
| 5 | 12 (6%) | 9 (23%) | 2 (3%) | 2 (3%) | |
| 6 | 13 (6%) | 3 (8%) | 4 (7%) | 2 (3%) | |
| 7 | 49 (23%) | 5 (13%) | 18 (31%) | 23 (31%) | |
| Seq_region | |||||
| V3-V4 | 63 (30%) | 4 (10%) | 14 (24%) | 25 (34%) | |
| V4 | 121 (58%) | 33 (83%) | 33 (57%) | 42 (57%) | |
| V5-V6 | 25 (12%) | 3 (8%) | 11 (19%) | 7 (9%) | |
| Seq_date | |||||
| 2015 | 49 (23%) | 5 (13%) | 18 (31%) | 23 (31%) | |
| 2017 | 12 (6%) | 9 (23%) | 2 (3%) | 2 (3%) | |
| 2018 | 50 (24%) | 8 (20%) | 16 (28%) | 22 (30%) | |
| 2020 | 24 (11%) | 3 (8%) | 5 (9%) | 7 (9%) | |
| 2021 | 16 (8%) | 9 (23%) | 2 (3%) | 5 (7%) | |
| 2022 | 14 (7%) | 3 (8%) | 4 (7%) | 2 (3%) | |
| 2023 | 44 (21%) | 3 (8%) | 11 (19%) | 13 (18%) | |
| Health_state | 0.3 | ||||
| Disease | 88 (42%) | 23 (58%) | 28 (48%) | 31 (42%) | |
| Health | 121 (58%) | 17 (43%) | 30 (52%) | 43 (58%) | |
| Age | 35 (28, 47) | 38 (31, 49) | 32 (28, 41) | 28 (26, 46) | 0.5 |
| Unknown | 137 | 12 | 43 | 55 | |
| Age_range | 0.002 | ||||
| > 43 | 21 (13%) | 13 (37%) | 4 (10%) | 6 (12%) | |
| 16-43 | 139 (87%) | 22 (63%) | 36 (90%) | 45 (88%) | |
| Unknown | 49 | 5 | 18 | 23 | |
| Sex | 0.015 | ||||
| female | 54 (51%) | 9 (31%) | 8 (35%) | 8 (24%) | |
| male | 51 (49%) | 20 (69%) | 15 (65%) | 26 (76%) | |
| Unknown | 104 | 11 | 35 | 40 | |
| Country | |||||
| Australia | 2 (1%) | 3 (8%) | 0 (0%) | 1 (1%) | |
| Austria | 12 (6%) | 12 (30%) | 2 (3%) | 2 (3%) | |
| Belgium | 25 (12%) | 3 (8%) | 11 (19%) | 7 (9%) | |
| Germany | 29 (14%) | 1 (3%) | 5 (9%) | 10 (14%) | |
| Greece | 1 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Hungary | 0 (0%) | 2 (5%) | 0 (0%) | 0 (0%) | |
| Ireland | 1 (0%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Israel | 14 (7%) | 3 (8%) | 4 (7%) | 2 (3%) | |
| Italy | 20 (10%) | 2 (5%) | 3 (5%) | 12 (16%) | |
| Norway | 0 (0%) | 0 (0%) | 0 (0%) | 1 (1%) | |
| Poland | 43 (21%) | 3 (8%) | 11 (19%) | 13 (18%) | |
| Spain | 49 (23%) | 5 (13%) | 18 (31%) | 23 (31%) | |
| UK | 2 (1%) | 4 (10%) | 4 (7%) | 2 (3%) | |
| USA | 11 (5%) | 2 (5%) | 0 (0%) | 1 (1%) | |
| Race | 0.6 | ||||
| African American | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Asian or Pacific Islander | 0 (0%) | 0 (0%) | 1 (5%) | 0 (0%) | |
| Caucasian | 60 (87%) | 19 (100%) | 18 (90%) | 19 (86%) | |
| Hispanic | 1 (1%) | 0 (0%) | 0 (0%) | 0 (0%) | |
| Other | 7 (10%) | 0 (0%) | 1 (5%) | 3 (14%) | |
| Unknown | 140 | 21 | 38 | 52 | |
| Smoking | 8 (7%) | 2 (9%) | 1 (3%) | 5 (14%) | 0.4 |
| Unknown | 94 | 18 | 27 | 39 | |
| Alcohol | 0.9 | ||||
| Never | 8 (17%) | 2 (13%) | 1 (11%) | 1 (7%) | |
| Occasionally (1-2 times/week) | 15 (32%) | 3 (19%) | 1 (11%) | 5 (33%) | |
| Rarely (a few times/month) | 17 (36%) | 8 (50%) | 4 (44%) | 7 (47%) | |
| Regularly (3-7 times/week) | 7 (15%) | 3 (19%) | 3 (33%) | 2 (13%) | |
| Unknown | 162 | 24 | 49 | 59 | |
| Antibiotics_usage | 0.2 | ||||
| 1-6 months | 25 (25%) | 5 (42%) | 11 (38%) | 7 (19%) | |
| 12 months/Not use | 75 (75%) | 7 (58%) | 18 (62%) | 29 (81%) | |
| Unknown | 109 | 28 | 29 | 38 | |
| Physical_activity | 8 (7%) | 2 (9%) | 1 (3%) | 5 (14%) | 0.4 |
| Unknown | 94 | 18 | 27 | 39 | |
| Travel_period | 0.2 | ||||
| 3 months | 11 (23%) | 4 (27%) | 1 (11%) | 2 (13%) | |
| 6 months | 5 (11%) | 0 (0%) | 1 (11%) | 4 (27%) | |
| Month | 7 (15%) | 5 (33%) | 1 (11%) | 0 (0%) | |
| Year | 24 (51%) | 6 (40%) | 6 (67%) | 9 (60%) | |
| Unknown | 162 | 25 | 49 | 59 | |
| Education_level | 0.8 | ||||
| Bachelor's level | 18 (38%) | 5 (33%) | 3 (33%) | 5 (38%) | |
| Master's level | 14 (30%) | 7 (47%) | 4 (44%) | 3 (23%) | |
| School Level | 15 (32%) | 3 (20%) | 2 (22%) | 5 (38%) | |
| Unknown | 162 | 25 | 49 | 61 | |
| Cosmetics | 0.10 | ||||
| Never | 19 (40%) | 12 (75%) | 4 (50%) | 9 (60%) | |
| Occasionally (a few-8 times/month) | 11 (23%) | 0 (0%) | 3 (38%) | 3 (20%) | |
| Regularly (3-7 times/week) | 17 (36%) | 4 (25%) | 1 (13%) | 3 (20%) | |
| Unknown | 162 | 24 | 50 | 59 | |
| Sleep_duration | 0.092 | ||||
| > 8 hours | 4 (9%) | 2 (13%) | 1 (11%) | 2 (13%) | |
| 4-6 hours | 2 (4%) | 5 (31%) | 1 (11%) | 1 (7%) | |
| 6-7 hours | 19 (40%) | 6 (38%) | 1 (11%) | 5 (33%) | |
| 7-8 hours | 22 (47%) | 3 (19%) | 6 (67%) | 7 (47%) | |
| Unknown | 162 | 24 | 49 | 59 | |
| BMI_category | 0.5 | ||||
| normal/overweight | 158 (98%) | 20 (100%) | 49 (98%) | 60 (98%) | |
| obese | 4 (2%) | 0 (0%) | 0 (0%) | 1 (2%) | |
| underweight | 0 (0%) | 0 (0%) | 1 (2%) | 0 (0%) | |
| Unknown | 47 | 20 | 8 | 13 | |
| 1 n (%); Median (IQR) | |||||
| 2 Pearson’s Chi-squared test; Kruskal-Wallis rank sum test; Fisher’s exact test | |||||
`
data_wide <- data_wide_batched
write_rds(data_wide,
file = "data/originals/data_wide.rds")
# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")
neuromediators <- read_xlsx (path, 2) %>%
mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>%
unique() %>%
pivot_wider(names_from = Neuromediator, values_from = Destroy)
probiotics <- read_xlsx (path, 3) %>%
add_column(probiotics = 1)
special_properties <- read_xlsx (path, 4) %>%
add_column(special_properties = 1)
vitamins <- read_xlsx (path, 5) %>%
pivot_wider (names_from = Vitamin, values_from = Vitamin,
values_fn = function(x) ifelse(is.na (x), NA, 1))
habbits <- read_xlsx (path, 7) %>%
unique() %>% #удаление повторяющихся строк
pivot_wider(names_from = Habbit, values_from = Habit_state)
bac_functions <- read_xlsx (path, 1) %>% #Патогены и нежелательные
full_join(neuromediators, by = taxon) %>% #Нейромедиаторы
full_join(probiotics, by = taxon) %>% #Пробиотики
full_join(special_properties, by = taxon) %>% #С особыми свойствами
full_join(vitamins, by = taxon) %>% #Витамины
full_join(read_xlsx (path, 6), by = taxon) %>% #Продуценты КЦДК
full_join(habbits, by = taxon) %>% #Вредные привычки
unite("Taxon", TaxonName, Rank, sep = "_") %>%
filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
mutate_all(as.factor)
rm (path, taxon, neuromediators, probiotics, special_properties, vitamins, habbits)
data_wide в длинный формат
(data_long), объединение data_long и
bac_functions# Создание датасета в длинном формате
data_long <- data_wide %>%
pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
names_to = "Taxon", values_to = "Percentage")
#Перезапись data_long с добавлением функций бактерий
data_long <- data_long %>%
left_join (bac_functions, by = "Taxon")
#Сохранение data_long.rds
write_rds(data_long,
file = "data/originals/data_long.rds",
compress = "gz")
G_long <- data_long %>% subset(grepl("_G", Taxon))
# %>% select (where (function(x) sum (is.na(x))/ nrow(.) * 100 < threshold_percent))
F_long <- data_long %>% subset(grepl("_F", Taxon))
C_long <- data_long %>% subset(grepl("_C", Taxon))
O_long <- data_long %>% subset(grepl("_O", Taxon))
P_long <- data_long %>% subset(grepl("_P", Taxon))
write_rds(G_long,
file = "data/originals/G_long.rds", "gz")
write_rds(F_long,
file = "data/originals/F_long.rds", "gz")
write_rds(C_long,
file = "data/originals/C_long.rds", "gz")
write_rds(O_long,
file = "data/originals/O_long.rds", "gz")
write_rds(P_long,
file = "data/originals/P_long.rds", "gz")
comparison <- function (special_data, special_property) {
#vector of the G_taxa names with special properties:
bac_names <- special_data %>%
subset(grepl("_G", Taxon)) %>% .$Taxon
#number of the G_taxa in data_wide
number_of_taxons <- data_wide %>%
select(patient_ID, Health_state, any_of(bac_names)) %>%
ncol() - 2
# t-test
t <- data_wide %>%
select(patient_ID, Health_state, any_of(bac_names)) %>%
pivot_longer(ends_with("_G"),
names_to = "Taxon", values_to = "Percentage") %>%
group_by(patient_ID, Health_state) %>%
summarise(across(Percentage, sum)) %>%
ungroup() %>%
t.test (Percentage ~ Health_state, .)
#a tibble with results
tibble("G_taxa" = special_property,
"Number of taxa" = number_of_taxons,
"Mean of total percentage in IBS" = t$estimate["mean in group Disease"],
"Mean of total percentage in healthy" = t$estimate["mean in group Health"],
"95% CI_1" = t$conf.int[1],
"95% CI_2" = t$conf.int[2],
"p.unadjusted" = t$p.value)
}
comparison_in_total <- rbind(
comparison (bac_functions %>% filter (Серотонин == "produce"), "Serotonin producing"),
comparison (bac_functions %>% filter (Серотонин == "destroy"), "Serotonin destroying"),
comparison (bac_functions %>% filter (Ацетилхолин == "produce"), "Acetylcholine producing"),
comparison (bac_functions %>% filter (ГАМК == "produce"), "GABA producing"),
comparison (bac_functions %>% filter (ГАМК == "destroy"), "GABA destroying"),
comparison (bac_functions %>% filter (Дофамин == "produce"), "Dopamine producing"),
comparison (bac_functions %>% filter (Дофамин == "destroy"), "Dopamine destroying"),
comparison (bac_functions %>% filter (probiotics == 1), "Probiotics"),
comparison (bac_functions %>% filter (special_properties == 1), "Special_properties"),
comparison (bac_functions %>% filter (Gases == 1), "Gases producing"),
comparison (bac_functions %>% filter (Oral == 1), "Oral"),
comparison (bac_functions %>% filter (Inflammatory == 1), "Inflammatory"),
comparison (bac_functions %>% filter (Bacteria_category == "Патоген"), "Pathogens"),
comparison (bac_functions %>% filter (Bacteria_category == "Условно-нормальный"), "Conditionally normal"),
comparison (bac_functions %>% filter (A == 1), "Vitamin A producing"),
comparison (bac_functions %>% filter (B1 == 1), "Vitamin B1 producing"),
comparison (bac_functions %>% filter (B12 == 1), "Vitamin B12 producing"),
comparison (bac_functions %>% filter (B2 == 1), "Vitamin B2 producing"),
comparison (bac_functions %>% filter (B3 == 1), "Vitamin B3 producing"),
comparison (bac_functions %>% filter (B5 == 1), "Vitamin B5 producing"),
comparison (bac_functions %>% filter (B6 == 1), "Vitamin B6 producing"),
comparison (bac_functions %>% filter (B7 == 1), "Vitamin B7 producing"),
comparison (bac_functions %>% filter (B9 == 1), "Vitamin B9 producing"),
comparison (bac_functions %>% filter (D3 == 1), "Vitamin D3 producing"),
comparison (bac_functions %>% filter (K2 == 1), "Vitamin K2 producing"),
comparison (bac_functions %>% filter (Ацетат == 1), "Acetate producing"),
comparison (bac_functions %>% filter (Пропионат == 1), "Propionate producing"),
comparison (bac_functions %>% filter (`Масляная кислота` == 1), "Butyrate producing"),
comparison (bac_functions %>% filter (Кофе == "Увеличена"), "Increase with coffee"),
comparison (bac_functions %>% filter (Курение == "Увеличена"), "Increase with smoking"),
comparison (bac_functions %>% filter (Курение == "Уменьшена"), "Derease with smoking"),
comparison (bac_functions %>% filter (Алкоголь == "Увеличена"), "Increase with alcohol"),
comparison (bac_functions %>% filter (Алкоголь == "Уменьшена"), "Derease with alcohol")
) %>%
filter (`Number of taxa` >= 3) %>%
add_column(.before = "95% CI_1", "Difference in means" = .$"Mean of total percentage in IBS" - .$"Mean of total percentage in healthy") %>%
mutate (across (c ("Mean of total percentage in IBS", "Mean of total percentage in healthy", "Difference in means", "95% CI_1", "95% CI_2"), function (x) round (x,2)),
p.unadjusted = round (p.unadjusted, 3),
p.value.holm = p.adjust(p.unadjusted, "holm")) %>%
unite("95% CI_1", "95% CI_2", col = "95% CI", sep = ":") %>%
arrange(desc (abs (.$"Difference in means")))
write_rds(comparison_in_total,
file = "data/originals/comparison_in_total.rds")
comparison_in_total %>%
mutate (p.unadjusted = ifelse(p.unadjusted < 0.001, "< 0.001", p.unadjusted),
p.value.holm = ifelse(p.value.holm < 0.001, "< 0.001", p.value.holm)) %>%
flextable() %>% theme_box() %>%
align(align = "center", part = "all") %>%
flextable::footnote (i = 1, j = 7, value = as_paragraph (c ("Welch Two Sample t-test")),
ref_symbols = "1", part = "header") %>%
set_caption("Results of IBS/healthy people comparison of means of total percentage of G-taxa with special properties")
G_taxa | Number of taxa | Mean of total percentage in IBS | Mean of total percentage in healthy | Difference in means | 95% CI | p.unadjusted1 | p.value.holm |
|---|---|---|---|---|---|---|---|
Inflammatory | 48 | 5.34 | 3.00 | 2.34 | 1.65:3.03 | < 0.001 | < 0.001 |
Propionate producing | 18 | 22.95 | 20.66 | 2.29 | 0.11:4.48 | 0.04 | 0.8 |
Acetate producing | 31 | 26.19 | 24.30 | 1.89 | -0.31:4.08 | 0.092 | 1 |
Increase with alcohol | 3 | 15.25 | 13.38 | 1.87 | -0.14:3.87 | 0.068 | 1 |
Increase with coffee | 5 | 7.71 | 9.49 | -1.78 | -2.66:-0.89 | < 0.001 | < 0.001 |
Vitamin B12 producing | 4 | 4.69 | 6.23 | -1.54 | -2.35:-0.74 | < 0.001 | < 0.001 |
Pathogens | 7 | 1.31 | 0.07 | 1.24 | 0.91:1.57 | < 0.001 | < 0.001 |
Special_properties | 34 | 11.12 | 12.28 | -1.16 | -2.15:-0.17 | 0.022 | 0.484 |
Conditionally normal | 50 | 4.37 | 3.27 | 1.10 | 0.43:1.77 | 0.001 | 0.023 |
Gases producing | 9 | 2.77 | 2.01 | 0.76 | 0.38:1.14 | < 0.001 | < 0.001 |
Serotonin destroying | 7 | 2.97 | 2.25 | 0.72 | 0.09:1.36 | 0.026 | 0.546 |
Probiotics | 5 | 1.97 | 2.48 | -0.51 | -1.15:0.12 | 0.115 | 1 |
Vitamin B9 producing | 9 | 3.34 | 3.83 | -0.50 | -1.14:0.15 | 0.131 | 1 |
Serotonin producing | 20 | 5.75 | 6.20 | -0.45 | -1.24:0.34 | 0.266 | 1 |
Increase with smoking | 10 | 2.47 | 2.02 | 0.45 | -0.1:1 | 0.11 | 1 |
GABA producing | 6 | 2.82 | 3.27 | -0.44 | -1.09:0.2 | 0.175 | 1 |
Vitamin D3 producing | 4 | 2.90 | 3.28 | -0.38 | -1.04:0.28 | 0.257 | 1 |
Vitamin A producing | 4 | 2.77 | 3.13 | -0.36 | -1.02:0.29 | 0.279 | 1 |
Vitamin B1 producing | 7 | 3.00 | 3.36 | -0.36 | -1.04:0.31 | 0.292 | 1 |
Vitamin B5 producing | 13 | 3.90 | 3.57 | 0.33 | -0.13:0.79 | 0.159 | 1 |
Vitamin B2 producing | 29 | 7.46 | 7.73 | -0.27 | -1.11:0.57 | 0.534 | 1 |
Derease with smoking | 4 | 3.52 | 3.34 | 0.18 | -0.52:0.88 | 0.621 | 1 |
Butyrate producing | 34 | 25.06 | 25.16 | -0.11 | -2.17:1.95 | 0.919 | 1 |
Vitamin B3 producing | 16 | 6.90 | 7.00 | -0.10 | -0.92:0.72 | 0.811 | 1 |
Vitamin B6 producing | 25 | 5.90 | 5.81 | 0.09 | -0.71:0.89 | 0.829 | 1 |
Vitamin B7 producing | 14 | 3.01 | 3.09 | -0.08 | -0.58:0.42 | 0.75 | 1 |
Oral | 18 | 0.80 | 0.86 | -0.07 | -0.33:0.19 | 0.612 | 1 |
Vitamin K2 producing | 3 | 0.03 | 0.03 | 0.00 | -0.02:0.01 | 0.631 | 1 |
1Welch Two Sample t-test | |||||||
combined_bacteria с по группе Health_state из
combined_info без учета таксономического уровня.# Создание нового датасета для сохранения результатов
result_dataset <- data.frame(
Variable_Name = character(),
Test_Type = character(),
P_Value = numeric(),
Normal_Distribution = character(),
stringsAsFactors = FALSE
)
alpha = 0.05
# Объединяем датасеты по patient_id
combined_data <- data_wide %>%
select(Health_state,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")))
combined_bacteria_clean <- combined_data
# Получаем список переменных из датасета combined_bacteria, исключая "patient_ID"
values_to_exclude <- c("patient_ID", "Seq_date", "Age")
variable_names <- setdiff(names(combined_bacteria_clean), values_to_exclude)
# Проходим по каждой переменной
for (variable in variable_names) {
# Фильтрация данных (исключаем строки с NA и 0 в текущей переменной)
filtered_data <- combined_data[!is.na(combined_data[[variable]]) & combined_data[[variable]] != 0, ]
#print(variable)
#print(filtered_data)
# Проверим что датасет filtered_data не пустой и что количество групп сравнения более 1, в нашем случае их 2 :)
if (nrow(filtered_data) > 0 & length(unique(filtered_data$Health_state)) > 1) {
# Check if there is sufficient variability in the data
if (length(unique(filtered_data$Health_state)) > 1) {
# Проверка на нормальность
shapiro_test_result <- try(shapiro.test(filtered_data[[variable]]), silent = TRUE)
if (inherits(shapiro_test_result, "try-error")) {
warning(paste("Skipping Shapiro-Wilk test for variable", variable, "due to an error."))
next
}
p_value_shapiro <- shapiro_test_result$p.value
# Выбор соответствующего статистического теста
if (p_value_shapiro > 0.05) {
# Если нормальное распределение, провести дисперсионный анализ
model <- aov(filtered_data[[variable]] ~ Health_state, data = filtered_data)
summary_list <- summary(model)
test_type <- "ANOVA"
} else {
# Если не нормальное распределение, использовать тест Краскела-Уоллиса
tryCatch({
model <- kruskal.test(filtered_data[[variable]] ~ Health_state, data = filtered_data)
test_type <- "Kruskal-Wallis"
}, error = function(e) {
warning(paste("Skipping Kruskal-Wallis test for variable", variable, "due to an error:", conditionMessage(e)))
next
})
}
# Добавление результатов выбранного теста в датасет
result_dataset <- rbind(result_dataset,
data.frame(Variable_Name = variable,
Test_Type = test_type,
P_Value = ifelse(test_type == "ANOVA", summary_list[[1]]$"Pr(>F)"[1], model$p.value),
Normal_Distribution = ifelse(p_value_shapiro > 0.05, "Yes", "No")))
} else {
# If there is no variability, skip the tests
warning(paste("Skipping tests for variable", variable, "as there is not enough variability in the data."))
}
}}
# Добавление столбца с поправкой Бонферрони
result_dataset$Adjusted_P_Value_Bonferroni <- p.adjust(result_dataset$P_Value, method = "bonferroni")
# Добавление столбца с поправкой Холма
result_dataset$Adjusted_P_Value_Holm <- p.adjust(result_dataset$P_Value, method = "holm")
# Добавление столбца с поправкой Бенджамини-Хохберга
result_dataset$Adjusted_P_Value_BH <- p.adjust(result_dataset$P_Value, method = "BH")
result_dataset$test_pass = ifelse(result_dataset$Adjusted_P_Value_Bonferroni < alpha & result_dataset$Adjusted_P_Value_Holm < alpha & result_dataset$Adjusted_P_Value_BH < alpha, "Y", "N")
result_dataset_pass <- result_dataset %>% filter(test_pass == "Y")
# Вывод результатов
print(result_dataset_pass)
## Variable_Name Test_Type P_Value
## 1 Eukaryota_D Kruskal-Wallis 3.641890e-07
## 2 Acidobacteriota_P Kruskal-Wallis 4.931868e-06
## 3 Actinobacteriota_P Kruskal-Wallis 5.956848e-07
## 4 Armatimonadota_P Kruskal-Wallis 5.555616e-10
## 5 Bdellovibrionota_P Kruskal-Wallis 1.102695e-09
## 6 Campylobacterota_P Kruskal-Wallis 4.120866e-06
## 7 Cyanobacteria_P Kruskal-Wallis 2.162075e-22
## 8 Desulfobacterota_P Kruskal-Wallis 2.250375e-08
## 9 Fibrobacterota_P Kruskal-Wallis 9.167248e-08
## 10 Firmicutes_P Kruskal-Wallis 3.498944e-16
## 11 Fusobacteriota_P Kruskal-Wallis 2.304884e-12
## 12 Nitrospirota_P Kruskal-Wallis 7.764816e-07
## 13 Proteobacteria_P Kruskal-Wallis 1.121119e-31
## 14 Verrucomicrobiota_P Kruskal-Wallis 1.976683e-08
## 15 Bacillales_O Kruskal-Wallis 1.093474e-12
## 16 Bdellovibrionales_O Kruskal-Wallis 2.180046e-12
## 17 Burkholderiales_O Kruskal-Wallis 2.874918e-05
## 18 Clostridia UCG-014_O Kruskal-Wallis 3.732801e-19
## 19 Clostridia vadinBB60 group_O Kruskal-Wallis 3.209141e-28
## 20 Clostridiales_O Kruskal-Wallis 1.067999e-26
## 21 DTU014_O Kruskal-Wallis 1.461310e-07
## 22 Desulfitobacteriales_O Kruskal-Wallis 1.646832e-05
## 23 Desulfobacterales_O Kruskal-Wallis 3.271719e-07
## 24 Desulfovibrionales_O Kruskal-Wallis 4.598530e-08
## 25 Enterobacterales_O Kruskal-Wallis 3.493634e-43
## 26 Fibrobacterales_O Kruskal-Wallis 1.952118e-11
## 27 Fusobacteriales_O Kruskal-Wallis 2.304884e-12
## 28 Gastranaerophilales_O Kruskal-Wallis 1.784014e-33
## 29 Izemoplasmatales_O Kruskal-Wallis 3.915759e-32
## 30 Lactobacillales_O Kruskal-Wallis 1.933468e-09
## 31 Opitutales_O Kruskal-Wallis 5.646650e-07
## 32 Oscillospirales_O Kruskal-Wallis 6.906239e-11
## 33 Pedosphaerales_O Kruskal-Wallis 1.302641e-15
## 34 Peptostreptococcales-Tissierellales_O Kruskal-Wallis 1.840308e-26
## 35 Pirellulales_O Kruskal-Wallis 3.196572e-09
## 36 Propionibacteriales_O Kruskal-Wallis 1.267265e-13
## 37 Pseudomonadales_O Kruskal-Wallis 3.208077e-12
## 38 RF39_O Kruskal-Wallis 1.317623e-08
## 39 Rhodobacterales_O Kruskal-Wallis 9.110302e-22
## 40 Rhodospirillales_O Kruskal-Wallis 9.132432e-06
## 41 Rickettsiales_O Kruskal-Wallis 2.020389e-16
## 42 SBR1031_O Kruskal-Wallis 1.172360e-06
## 43 Staphylococcales_O Kruskal-Wallis 3.046562e-07
## 44 Veillonellales-Selenomonadales_O Kruskal-Wallis 1.606580e-11
## 45 Verrucomicrobiales_O Kruskal-Wallis 7.214817e-30
## 46 Victivallales_O Kruskal-Wallis 1.262006e-43
## 47 Xanthomonadales_O Kruskal-Wallis 2.298532e-35
## 48 Bdellovibrionia_C Kruskal-Wallis 1.961073e-08
## 49 Campylobacteria_C Kruskal-Wallis 4.506978e-06
## 50 Chloroflexia_C Kruskal-Wallis 1.333103e-52
## 51 Clostridia_C Kruskal-Wallis 2.869853e-19
## 52 Dehalococcoidia_C Kruskal-Wallis 1.999580e-05
## 53 Desulfitobacteriia_C Kruskal-Wallis 1.646832e-05
## 54 Desulfobacteria_C Kruskal-Wallis 2.570935e-06
## 55 Desulfovibrionia_C Kruskal-Wallis 4.598530e-08
## 56 Fibrobacteria_C Kruskal-Wallis 1.952118e-11
## 57 Fusobacteriia_C Kruskal-Wallis 2.304884e-12
## 58 Gammaproteobacteria_C Kruskal-Wallis 5.052401e-24
## 59 Gracilibacteria_C Kruskal-Wallis 3.949566e-09
## 60 Lentisphaeria_C Kruskal-Wallis 1.437163e-32
## 61 Microgenomatia_C Kruskal-Wallis 2.221509e-05
## 62 Negativicutes_C Kruskal-Wallis 1.630876e-10
## 63 Parcubacteria_C Kruskal-Wallis 4.142854e-11
## 64 Phycisphaerae_C Kruskal-Wallis 3.006549e-07
## 65 Vampirivibrionia_C Kruskal-Wallis 1.714190e-17
## 66 Verrucomicrobiae_C Kruskal-Wallis 5.551747e-20
## 67 vadinHA49_C Kruskal-Wallis 4.885983e-06
## 68 A4b_F Kruskal-Wallis 5.664201e-18
## 69 Aerococcaceae_F Kruskal-Wallis 1.005536e-10
## 70 Akkermansiaceae_F Kruskal-Wallis 1.680017e-29
## 71 Alteromonadaceae_F Kruskal-Wallis 5.510037e-25
## 72 Bacillaceae_F Kruskal-Wallis 2.467063e-12
## 73 Bdellovibrionaceae_F Kruskal-Wallis 2.180046e-12
## 74 Burkholderiaceae_F Kruskal-Wallis 4.981468e-21
## 75 Caloramatoraceae_F Kruskal-Wallis 2.538328e-21
## 76 Carnobacteriaceae_F Kruskal-Wallis 1.039051e-05
## 77 Clostridiaceae_F Kruskal-Wallis 8.902870e-27
## 78 Comamonadaceae_F Kruskal-Wallis 6.571902e-11
## 79 Coriobacteriaceae_F Kruskal-Wallis 4.658545e-12
## 80 Corynebacteriaceae_F Kruskal-Wallis 1.917933e-06
## 81 Desulfovibrionaceae_F Kruskal-Wallis 9.189183e-09
## 82 Dethiosulfatibacteraceae_F Kruskal-Wallis 3.006610e-48
## 83 Enterobacteriaceae_F Kruskal-Wallis 2.753246e-45
## 84 Erwiniaceae_F Kruskal-Wallis 1.426000e-46
## 85 Family XI_F Kruskal-Wallis 1.732333e-07
## 86 Fusibacteraceae_F Kruskal-Wallis 1.175304e-12
## 87 Fusobacteriaceae_F Kruskal-Wallis 1.130697e-06
## 88 Gemellaceae_F Kruskal-Wallis 3.263173e-17
## 89 Leptotrichiaceae_F Kruskal-Wallis 3.156349e-20
## 90 ML635J-40 aquatic group_F Kruskal-Wallis 1.343215e-06
## 91 Micrococcaceae_F Kruskal-Wallis 3.198537e-28
## 92 Moraxellaceae_F Kruskal-Wallis 3.150003e-27
## 93 Morganellaceae_F Kruskal-Wallis 2.198451e-15
## 94 Neisseriaceae_F Kruskal-Wallis 4.802029e-38
## 95 Oxalobacteraceae_F Kruskal-Wallis 1.372032e-12
## 96 Pasteurellaceae_F Kruskal-Wallis 1.032882e-24
## 97 Pedosphaeraceae_F Kruskal-Wallis 1.302641e-15
## 98 Peptostreptococcaceae_F Kruskal-Wallis 7.392404e-35
## 99 Pirellulaceae_F Kruskal-Wallis 3.196572e-09
## 100 Planococcaceae_F Kruskal-Wallis 3.691366e-18
## 101 Propionibacteriaceae_F Kruskal-Wallis 6.631706e-09
## 102 Pseudomonadaceae_F Kruskal-Wallis 3.996889e-22
## 103 Rhodobacteraceae_F Kruskal-Wallis 9.110302e-22
## 104 Rhodocyclaceae_F Kruskal-Wallis 7.399735e-20
## 105 Ruminococcaceae_F Kruskal-Wallis 1.582549e-08
## 106 SB-5_F Kruskal-Wallis 3.298360e-07
## 107 Shewanellaceae_F Kruskal-Wallis 5.652343e-08
## 108 Sutterellaceae_F Kruskal-Wallis 1.095159e-05
## 109 UCG-010_F Kruskal-Wallis 1.173862e-23
## 110 Veillonellaceae_F Kruskal-Wallis 2.025402e-12
## 111 Vibrionaceae_F Kruskal-Wallis 1.133000e-19
## 112 Victivallaceae_F Kruskal-Wallis 3.983873e-47
## 113 Weeksellaceae_F Kruskal-Wallis 7.658696e-15
## 114 Xanthomonadaceae_F Kruskal-Wallis 1.867012e-05
## 115 Yersiniaceae_F Kruskal-Wallis 1.036265e-37
## 116 [Eubacterium] coprostanoligenes group_F Kruskal-Wallis 1.838722e-27
## 117 Acetanaerobacterium_G Kruskal-Wallis 7.477478e-29
## 118 Acidaminococcus_G Kruskal-Wallis 2.000921e-05
## 119 Acinetobacter_G Kruskal-Wallis 7.548236e-30
## 120 Actinobacillus_G Kruskal-Wallis 9.404740e-23
## 121 Aeromonas_G Kruskal-Wallis 5.221036e-06
## 122 Akkermansia_G Kruskal-Wallis 1.680017e-29
## 123 Anaerococcus_G Kruskal-Wallis 1.600133e-09
## 124 Anaerocolumna_G Kruskal-Wallis 3.200634e-07
## 125 Anaerofilum_G Kruskal-Wallis 1.254379e-11
## 126 Anaerosporobacter_G Kruskal-Wallis 3.218415e-05
## 127 Anaerostipes_G Kruskal-Wallis 1.316182e-06
## 128 Bacillus_G Kruskal-Wallis 1.174276e-11
## 129 CAG-352_G Kruskal-Wallis 8.685810e-06
## 130 CAG-56_G Kruskal-Wallis 1.266937e-11
## 131 CHKCI001_G Kruskal-Wallis 6.501996e-06
## 132 CHKCI002_G Kruskal-Wallis 2.413639e-05
## 133 Candidatus Phytoplasma_G Kruskal-Wallis 7.466909e-06
## 134 Candidatus Stoquefichus_G Kruskal-Wallis 4.657966e-17
## 135 Caproiciproducens_G Kruskal-Wallis 1.982204e-06
## 136 Catenibacterium_G Kruskal-Wallis 5.744837e-07
## 137 Christensenella_G Kruskal-Wallis 4.540380e-13
## 138 Citrobacter_G Kruskal-Wallis 3.269696e-08
## 139 Cloacibacillus_G Kruskal-Wallis 2.656491e-19
## 140 Clostridium sensu stricto 13_G Kruskal-Wallis 1.765588e-33
## 141 Clostridium sensu stricto 1_G Kruskal-Wallis 1.001200e-29
## 142 Collinsella_G Kruskal-Wallis 1.584679e-11
## 143 DTU089_G Kruskal-Wallis 1.469309e-07
## 144 Desulfovibrio_G Kruskal-Wallis 8.406321e-18
## 145 Dethiosulfatibacter_G Kruskal-Wallis 3.006610e-48
## 146 Eggerthella_G Kruskal-Wallis 1.512409e-07
## 147 Enterobacter_G Kruskal-Wallis 4.283633e-49
## 148 Epulopiscium_G Kruskal-Wallis 4.252884e-05
## 149 Escherichia-Shigella_G Kruskal-Wallis 1.211205e-45
## 150 Faecalibacterium_G Kruskal-Wallis 1.487630e-10
## 151 Fastidiosipila_G Kruskal-Wallis 1.336125e-06
## 152 Fermentimonas_G Kruskal-Wallis 2.441946e-07
## 153 Fusibacter_G Kruskal-Wallis 1.175304e-12
## 154 Fusobacterium_G Kruskal-Wallis 1.088967e-09
## 155 GCA-900066575_G Kruskal-Wallis 6.399238e-14
## 156 GCA-900066755_G Kruskal-Wallis 7.629800e-39
## 157 Gemella_G Kruskal-Wallis 3.263173e-17
## 158 Geobacillus_G Kruskal-Wallis 2.557631e-41
## 159 Granulicatella_G Kruskal-Wallis 2.758976e-05
## 160 Halobacillus_G Kruskal-Wallis 4.880366e-05
## 161 Harryflintia_G Kruskal-Wallis 3.870844e-22
## 162 Hespellia_G Kruskal-Wallis 1.148969e-05
## 163 Holdemanella_G Kruskal-Wallis 2.340090e-13
## 164 Intestinimonas_G Kruskal-Wallis 9.422743e-07
## 165 Klebsiella_G Kruskal-Wallis 1.636362e-24
## 166 Lachnospiraceae NC2004 group_G Kruskal-Wallis 1.126652e-05
## 167 Lachnospiraceae NK4B4 group_G Kruskal-Wallis 3.406312e-05
## 168 Lachnospiraceae UCG-001_G Kruskal-Wallis 3.250466e-05
## 169 Lachnospiraceae UCG-002_G Kruskal-Wallis 2.994427e-09
## 170 Lachnospiraceae UCG-006_G Kruskal-Wallis 3.057389e-05
## 171 Lachnospiraceae UCG-008_G Kruskal-Wallis 3.008204e-05
## 172 Lachnospiraceae UCG-010_G Kruskal-Wallis 1.108805e-05
## 173 Lacticaseibacillus_G Kruskal-Wallis 9.016788e-11
## 174 Lysinibacillus_G Kruskal-Wallis 4.843857e-30
## 175 Macellibacteroides_G Kruskal-Wallis 8.252963e-10
## 176 Megamonas_G Kruskal-Wallis 2.480778e-05
## 177 Moryella_G Kruskal-Wallis 1.698210e-05
## 178 Negativibacillus_G Kruskal-Wallis 3.505090e-12
## 179 OM27 clade_G Kruskal-Wallis 2.689783e-40
## 180 Olsenella_G Kruskal-Wallis 6.665175e-09
## 181 Paludibacter_G Kruskal-Wallis 7.121028e-06
## 182 Pantoea_G Kruskal-Wallis 1.357796e-40
## 183 Paraclostridium_G Kruskal-Wallis 2.973228e-10
## 184 Paraprevotella_G Kruskal-Wallis 3.160620e-10
## 185 Parvimonas_G Kruskal-Wallis 4.199992e-28
## 186 Peptoniphilus_G Kruskal-Wallis 1.264624e-13
## 187 Peptostreptococcus_G Kruskal-Wallis 1.711585e-35
## 188 Porphyromonas_G Kruskal-Wallis 1.328965e-07
## 189 Prevotella_7_G Kruskal-Wallis 1.519248e-07
## 190 Prevotella_G Kruskal-Wallis 5.583780e-06
## 191 Prevotellaceae NK3B31 group_G Kruskal-Wallis 1.352145e-08
## 192 Prevotellaceae UCG-001_G Kruskal-Wallis 3.947135e-08
## 193 Robinsoniella_G Kruskal-Wallis 7.475667e-08
## 194 Romboutsia_G Kruskal-Wallis 2.498546e-34
## 195 Ruminiclostridium_G Kruskal-Wallis 8.539573e-14
## 196 Salmonella_G Kruskal-Wallis 3.407245e-50
## 197 Serratia_G Kruskal-Wallis 9.170091e-39
## 198 Shewanella_G Kruskal-Wallis 6.216394e-14
## 199 Sporobacter_G Kruskal-Wallis 8.346727e-13
## 200 Streptococcus_G Kruskal-Wallis 4.895500e-05
## 201 Subdoligranulum_G Kruskal-Wallis 1.307132e-12
## 202 Sutterella_G Kruskal-Wallis 3.585245e-19
## 203 Turicibacter_G Kruskal-Wallis 9.040827e-35
## 204 UCG-002_G Kruskal-Wallis 1.932637e-09
## 205 UCG-004_G Kruskal-Wallis 2.840630e-48
## 206 UCG-005_G Kruskal-Wallis 9.285072e-10
## 207 UCG-012_G Kruskal-Wallis 3.913332e-05
## 208 Veillonella_G Kruskal-Wallis 2.021157e-43
## 209 Vibrio_G Kruskal-Wallis 1.439854e-15
## 210 XBB1006_G Kruskal-Wallis 4.544741e-06
## 211 [Eubacterium] fissicatena group_G Kruskal-Wallis 6.453837e-08
## 212 [Ruminococcus] gnavus group_G Kruskal-Wallis 2.739855e-11
## Normal_Distribution Adjusted_P_Value_Bonferroni Adjusted_P_Value_Holm
## 1 No 3.084681e-04 2.498337e-04
## 2 No 4.177292e-03 3.294488e-03
## 3 No 5.045450e-04 4.068527e-04
## 4 No 4.705606e-07 4.038933e-07
## 5 No 9.339824e-07 7.972483e-07
## 6 No 3.490373e-03 2.769222e-03
## 7 No 1.831277e-19 1.721012e-19
## 8 No 1.906068e-05 1.591015e-05
## 9 No 7.764659e-05 6.407906e-05
## 10 No 2.963605e-13 2.704683e-13
## 11 No 1.952236e-09 1.730968e-09
## 12 No 6.576800e-04 5.295605e-04
## 13 No 9.495882e-29 9.170757e-29
## 14 No 1.674250e-05 1.399491e-05
## 15 No 9.261728e-10 8.299471e-10
## 16 No 1.846499e-09 1.641574e-09
## 17 No 2.435056e-02 1.854322e-02
## 18 No 3.161683e-16 2.919051e-16
## 19 No 2.718142e-25 2.596195e-25
## 20 No 9.045952e-24 8.586712e-24
## 21 No 1.237729e-04 1.018533e-04
## 22 No 1.394866e-02 1.078675e-02
## 23 No 2.771146e-04 2.250943e-04
## 24 No 3.894955e-05 3.237365e-05
## 25 No 2.959108e-40 2.917185e-40
## 26 No 1.653444e-08 1.442615e-08
## 27 No 1.952236e-09 1.730968e-09
## 28 No 1.511060e-30 1.464676e-30
## 29 No 3.316648e-29 3.207007e-29
## 30 No 1.637647e-06 1.393432e-06
## 31 No 4.782713e-04 3.867955e-04
## 32 No 5.849585e-08 5.069179e-08
## 33 No 1.103337e-12 1.005639e-12
## 34 No 1.558741e-23 1.477767e-23
## 35 No 2.707496e-06 2.295138e-06
## 36 No 1.073374e-10 9.669235e-11
## 37 No 2.717241e-09 2.396434e-09
## 38 No 1.116026e-05 9.381473e-06
## 39 No 7.716426e-19 7.224470e-19
## 40 No 7.735170e-03 6.036538e-03
## 41 No 1.711270e-13 1.563781e-13
## 42 No 9.929892e-04 7.960327e-04
## 43 No 2.580438e-04 2.102128e-04
## 44 No 1.360773e-08 1.188869e-08
## 45 No 6.110950e-27 5.887291e-27
## 46 No 1.068919e-40 1.056299e-40
## 47 No 1.946857e-32 1.898588e-32
## 48 No 1.661029e-05 1.390401e-05
## 49 No 3.817410e-03 3.024182e-03
## 50 No 1.129138e-49 1.129138e-49
## 51 No 2.430765e-16 2.249965e-16
## 52 No 1.693644e-02 1.301727e-02
## 53 No 1.394866e-02 1.078675e-02
## 54 No 2.177582e-03 1.730239e-03
## 55 No 3.894955e-05 3.237365e-05
## 56 No 1.653444e-08 1.442615e-08
## 57 No 1.952236e-09 1.730968e-09
## 58 No 4.279384e-21 4.036868e-21
## 59 No 3.345282e-06 2.827889e-06
## 60 No 1.217277e-29 1.178474e-29
## 61 No 1.881618e-02 1.441759e-02
## 62 No 1.381352e-07 1.190540e-07
## 63 No 3.508997e-08 3.049140e-08
## 64 No 2.546547e-04 2.077525e-04
## 65 No 1.451919e-14 1.333640e-14
## 66 No 4.702330e-17 4.374777e-17
## 67 No 4.138427e-03 3.268722e-03
## 68 No 4.797578e-15 4.418077e-15
## 69 No 8.516894e-08 7.360527e-08
## 70 No 1.422975e-26 1.365854e-26
## 71 No 4.667001e-22 4.419049e-22
## 72 No 2.089602e-09 1.845363e-09
## 73 No 1.846499e-09 1.641574e-09
## 74 No 4.219303e-18 3.935360e-18
## 75 No 2.149964e-18 2.007817e-18
## 76 No 8.800762e-03 6.857737e-03
## 77 No 7.540731e-24 7.166811e-24
## 78 No 5.566401e-08 4.830348e-08
## 79 No 3.945788e-09 3.470616e-09
## 80 No 1.624490e-03 1.294605e-03
## 81 No 7.783238e-06 6.551888e-06
## 82 No 2.546599e-45 2.534572e-45
## 83 No 2.331999e-42 2.307220e-42
## 84 No 1.207822e-43 1.197840e-43
## 85 No 1.467286e-04 1.200507e-04
## 86 No 9.954821e-10 8.908801e-10
## 87 No 9.577005e-04 7.688741e-04
## 88 No 2.763908e-14 2.535486e-14
## 89 No 2.673428e-17 2.490360e-17
## 90 No 1.137703e-03 9.080131e-04
## 91 No 2.709161e-25 2.590815e-25
## 92 No 2.668053e-24 2.538902e-24
## 93 No 1.862088e-12 1.690609e-12
## 94 No 4.067318e-35 3.980882e-35
## 95 No 1.162111e-09 1.035884e-09
## 96 No 8.748511e-22 8.273385e-22
## 97 No 1.103337e-12 1.005639e-12
## 98 No 6.261366e-32 6.098733e-32
## 99 No 2.707496e-06 2.295138e-06
## 100 No 3.126587e-15 2.882957e-15
## 101 No 5.617055e-06 4.741670e-06
## 102 No 3.385365e-19 3.173530e-19
## 103 No 7.716426e-19 7.224470e-19
## 104 No 6.267575e-17 5.823591e-17
## 105 No 1.340419e-05 1.123610e-05
## 106 No 2.793711e-04 2.265974e-04
## 107 No 4.787535e-05 3.967945e-05
## 108 No 9.275996e-03 7.217097e-03
## 109 No 9.942614e-21 9.367421e-21
## 110 No 1.715515e-09 1.527153e-09
## 111 No 9.596507e-17 8.905378e-17
## 112 No 3.374340e-44 3.350437e-44
## 113 No 6.486915e-12 5.881878e-12
## 114 No 1.581359e-02 1.217292e-02
## 115 No 8.777168e-35 8.580277e-35
## 116 No 1.557397e-24 1.483849e-24
## 117 No 6.333424e-26 6.064235e-26
## 118 No 1.694780e-02 1.301727e-02
## 119 No 6.393356e-27 6.151812e-27
## 120 No 7.965815e-20 7.495578e-20
## 121 No 4.422218e-03 3.482431e-03
## 122 No 1.422975e-26 1.365854e-26
## 123 No 1.355312e-06 1.155296e-06
## 124 No 2.710937e-04 2.205237e-04
## 125 No 1.062459e-08 9.320033e-09
## 126 No 2.725998e-02 2.066223e-02
## 127 No 1.114807e-03 8.923717e-04
## 128 No 9.946116e-09 8.736612e-09
## 129 No 7.356881e-03 5.750006e-03
## 130 No 1.073095e-08 9.400671e-09
## 131 No 5.507190e-03 4.323827e-03
## 132 No 2.044352e-02 1.564038e-02
## 133 No 6.324472e-03 4.950561e-03
## 134 No 3.945297e-14 3.609923e-14
## 135 No 1.678927e-03 1.336006e-03
## 136 No 4.865877e-04 3.929469e-04
## 137 No 3.845702e-10 3.455229e-10
## 138 No 2.769433e-05 2.308406e-05
## 139 No 2.250048e-16 2.085346e-16
## 140 No 1.495453e-30 1.451313e-30
## 141 No 8.480166e-27 8.149770e-27
## 142 No 1.342223e-08 1.174247e-08
## 143 No 1.244504e-04 1.022639e-04
## 144 No 7.120154e-15 6.548524e-15
## 145 No 2.546599e-45 2.534572e-45
## 146 No 1.281011e-04 1.051124e-04
## 147 No 3.628237e-46 3.619670e-46
## 148 No 3.602192e-02 2.713340e-02
## 149 No 1.025891e-42 1.016201e-42
## 150 No 1.260022e-07 1.087457e-07
## 151 No 1.131698e-03 9.045564e-04
## 152 No 2.068329e-04 1.689827e-04
## 153 No 9.954821e-10 8.908801e-10
## 154 No 9.223555e-07 7.884125e-07
## 155 No 5.420155e-11 4.901816e-11
## 156 No 6.462441e-36 6.340364e-36
## 157 No 2.763908e-14 2.535486e-14
## 158 No 2.166313e-38 2.133064e-38
## 159 No 2.336852e-02 1.782298e-02
## 160 No 4.133670e-02 3.108793e-02
## 161 No 3.278605e-19 3.077321e-19
## 162 No 9.731768e-03 7.537237e-03
## 163 No 1.982056e-10 1.783148e-10
## 164 No 7.981063e-04 6.416888e-04
## 165 No 1.385999e-21 1.309090e-21
## 166 No 9.542746e-03 7.402106e-03
## 167 No 2.885146e-02 2.180039e-02
## 168 No 2.753145e-02 2.083549e-02
## 169 No 2.536280e-06 2.152993e-06
## 170 No 2.589609e-02 1.965901e-02
## 171 No 2.547949e-02 1.937284e-02
## 172 No 9.391580e-03 7.295938e-03
## 173 No 7.637220e-08 6.609306e-08
## 174 No 4.102747e-27 3.957431e-27
## 175 No 6.990259e-07 5.991651e-07
## 176 No 2.101219e-02 1.605064e-02
## 177 No 1.438384e-02 1.108931e-02
## 178 No 2.968811e-09 2.614797e-09
## 179 No 2.278247e-37 2.237900e-37
## 180 No 5.645403e-06 4.758935e-06
## 181 No 6.031510e-03 4.728362e-03
## 182 No 1.150053e-37 1.131044e-37
## 183 No 2.518324e-07 2.167483e-07
## 184 No 2.677045e-07 2.300931e-07
## 185 No 3.557393e-25 3.393594e-25
## 186 No 1.071137e-10 9.661731e-11
## 187 No 1.449712e-32 1.415481e-32
## 188 No 1.125633e-04 9.276176e-05
## 189 No 1.286803e-04 1.054358e-04
## 190 No 4.729462e-03 3.718798e-03
## 191 No 1.145267e-05 9.613751e-06
## 192 No 3.343223e-05 2.782730e-05
## 193 No 6.331890e-05 5.232967e-05
## 194 No 2.116268e-31 2.056303e-31
## 195 No 7.233018e-11 6.532773e-11
## 196 No 2.885937e-47 2.882530e-47
## 197 No 7.767067e-36 7.611175e-36
## 198 No 5.265285e-11 4.767974e-11
## 199 No 7.069677e-10 6.343512e-10
## 200 No 4.146488e-02 3.113538e-02
## 201 No 1.107141e-09 9.881918e-10
## 202 No 3.036703e-16 2.807247e-16
## 203 No 7.657580e-32 7.449641e-32
## 204 No 1.636944e-06 1.393432e-06
## 205 No 2.406013e-45 2.397492e-45
## 206 No 7.864456e-07 6.731678e-07
## 207 No 3.314592e-02 2.500619e-02
## 208 No 1.711920e-40 1.689688e-40
## 209 No 1.219556e-12 1.108687e-12
## 210 No 3.849396e-03 3.044977e-03
## 211 No 5.466400e-05 4.524140e-05
## 212 No 2.320657e-08 2.019273e-08
## Adjusted_P_Value_BH test_pass
## 1 1.904124e-06 Y
## 2 2.320718e-05 Y
## 3 3.057848e-06 Y
## 4 3.888931e-09 Y
## 5 7.471859e-09 Y
## 6 1.983167e-05 Y
## 7 3.521687e-21 Y
## 8 1.351821e-07 Y
## 9 5.211180e-07 Y
## 10 3.951474e-15 Y
## 11 1.971956e-11 Y
## 12 3.961927e-06 Y
## 13 3.165294e-30 Y
## 14 1.195893e-07 Y
## 15 1.040644e-11 Y
## 16 1.923436e-11 Y
## 17 1.199535e-04 Y
## 18 4.790429e-18 Y
## 19 6.969596e-27 Y
## 20 2.055898e-25 Y
## 21 8.187529e-07 Y
## 22 7.190032e-05 Y
## 23 1.731966e-06 Y
## 24 2.686176e-07 Y
## 25 2.276237e-41 Y
## 26 1.503131e-10 Y
## 27 1.971956e-11 Y
## 28 5.596519e-32 Y
## 29 1.143672e-30 Y
## 30 1.279412e-08 Y
## 31 2.934179e-06 Y
## 32 5.131214e-10 Y
## 33 1.432905e-14 Y
## 34 3.463868e-25 Y
## 35 2.066791e-08 Y
## 36 1.262793e-12 Y
## 37 2.690338e-11 Y
## 38 8.206076e-08 Y
## 39 1.377933e-20 Y
## 40 4.136455e-05 Y
## 41 2.312527e-15 Y
## 42 5.875676e-06 Y
## 43 1.633189e-06 Y
## 44 1.259975e-10 Y
## 45 1.909672e-28 Y
## 46 9.717447e-42 Y
## 47 8.849349e-34 Y
## 48 1.194985e-07 Y
## 49 2.156729e-05 Y
## 50 1.129138e-49 Y
## 51 3.798071e-18 Y
## 52 8.559495e-05 Y
## 53 7.190032e-05 Y
## 54 1.244332e-05 Y
## 55 2.686176e-07 Y
## 56 1.503131e-10 Y
## 57 1.971956e-11 Y
## 58 8.733436e-23 Y
## 59 2.534305e-08 Y
## 60 4.347418e-31 Y
## 61 9.455367e-05 Y
## 62 1.170637e-09 Y
## 63 3.133033e-10 Y
## 64 1.622004e-06 Y
## 65 2.074170e-16 Y
## 66 7.837216e-19 Y
## 67 2.311971e-05 Y
## 68 7.055262e-17 Y
## 69 7.342150e-10 Y
## 70 3.952707e-28 Y
## 71 1.014565e-23 Y
## 72 2.089602e-11 Y
## 73 1.923436e-11 Y
## 74 7.274661e-20 Y
## 75 3.771866e-20 Y
## 76 4.681256e-05 Y
## 77 1.753658e-25 Y
## 78 4.926019e-10 Y
## 79 3.830862e-11 Y
## 80 9.390114e-06 Y
## 81 5.765362e-08 Y
## 82 4.244331e-46 Y
## 83 2.331999e-43 Y
## 84 1.509777e-44 Y
## 85 9.466363e-07 Y
## 86 1.093936e-11 Y
## 87 5.700598e-06 Y
## 88 3.838761e-16 Y
## 89 4.531234e-19 Y
## 90 6.614551e-06 Y
## 91 6.969596e-27 Y
## 92 6.352506e-26 Y
## 93 2.357074e-14 Y
## 94 2.140694e-36 Y
## 95 1.249581e-11 Y
## 96 1.861385e-23 Y
## 97 1.432905e-14 Y
## 98 2.722333e-33 Y
## 99 2.066791e-08 Y
## 100 4.666547e-17 Y
## 101 4.212988e-08 Y
## 102 6.269195e-21 Y
## 103 1.377933e-20 Y
## 104 1.027471e-18 Y
## 105 9.713180e-08 Y
## 106 1.735224e-06 Y
## 107 3.279133e-07 Y
## 108 4.907935e-05 Y
## 109 1.988523e-22 Y
## 110 1.825016e-11 Y
## 111 1.547824e-18 Y
## 112 4.820486e-45 Y
## 113 8.108644e-14 Y
## 114 8.068160e-05 Y
## 115 4.388584e-36 Y
## 116 3.798530e-26 Y
## 117 1.711736e-27 Y
## 118 8.559495e-05 Y
## 119 1.937381e-28 Y
## 120 1.561925e-21 Y
## 121 2.443214e-05 Y
## 122 3.952707e-28 Y
## 123 1.075645e-08 Y
## 124 1.704992e-06 Y
## 125 1.011865e-10 Y
## 126 1.323300e-04 Y
## 127 6.557686e-06 Y
## 128 9.563573e-11 Y
## 129 3.955312e-05 Y
## 130 1.012354e-10 Y
## 131 3.009394e-05 Y
## 132 1.022176e-04 Y
## 133 3.418633e-05 Y
## 134 5.404516e-16 Y
## 135 9.649006e-06 Y
## 136 2.966998e-06 Y
## 137 4.420347e-12 Y
## 138 1.950305e-07 Y
## 139 3.571505e-18 Y
## 140 5.596519e-32 Y
## 141 2.494167e-28 Y
## 142 1.254414e-10 Y
## 143 8.187529e-07 Y
## 144 1.031906e-16 Y
## 145 4.244331e-46 Y
## 146 8.355864e-07 Y
## 147 1.209412e-46 Y
## 148 1.715330e-04 Y
## 149 1.139878e-43 Y
## 150 1.076942e-09 Y
## 151 6.614551e-06 Y
## 152 1.325852e-06 Y
## 153 1.093936e-11 Y
## 154 7.438351e-09 Y
## 155 6.609945e-13 Y
## 156 3.801436e-37 Y
## 157 3.838761e-16 Y
## 158 1.547366e-39 Y
## 159 1.156858e-04 Y
## 160 1.955891e-04 Y
## 161 6.186047e-21 Y
## 162 5.068629e-05 Y
## 163 2.304716e-12 Y
## 164 4.779080e-06 Y
## 165 2.887497e-23 Y
## 166 4.996202e-05 Y
## 167 1.387089e-04 Y
## 168 1.330022e-04 Y
## 169 1.966108e-08 Y
## 170 1.263224e-04 Y
## 171 1.248995e-04 Y
## 172 4.942937e-05 Y
## 173 6.641061e-10 Y
## 174 1.323467e-28 Y
## 175 5.729721e-09 Y
## 176 1.045383e-04 Y
## 177 7.376328e-05 Y
## 178 2.910599e-11 Y
## 179 1.423904e-38 Y
## 180 4.212988e-08 Y
## 181 3.277995e-05 Y
## 182 7.667023e-39 Y
## 183 2.116239e-09 Y
## 184 2.230871e-09 Y
## 185 8.893484e-27 Y
## 186 1.262793e-12 Y
## 187 6.903392e-34 Y
## 188 7.504222e-07 Y
## 189 8.355864e-07 Y
## 190 2.598605e-05 Y
## 191 8.359612e-08 Y
## 192 2.337918e-07 Y
## 193 4.278304e-07 Y
## 194 8.465073e-33 Y
## 195 8.714480e-13 Y
## 196 1.442968e-47 Y
## 197 4.315037e-37 Y
## 198 6.500352e-13 Y
## 199 8.033724e-12 Y
## 200 1.955891e-04 Y
## 201 1.203414e-11 Y
## 202 4.671850e-18 Y
## 203 3.190658e-33 Y
## 204 1.279412e-08 Y
## 205 4.244331e-46 Y
## 206 6.393867e-09 Y
## 207 1.585929e-04 Y
## 208 1.426600e-41 Y
## 209 1.563533e-14 Y
## 210 2.162582e-05 Y
## 211 3.718639e-07 Y
## 212 2.090682e-10 Y
combined_bacteria_G <- combined_bacteria_clean %>%
select(Health_state, ends_with("_G"))
d <- dist(combined_bacteria_G) # euclidean distances between the rows
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim
df_mds <- data.frame(
x = fit$points[,1],
y = fit$points[,2]
)
df_full <- cbind(df_mds, combined_info) %>% mutate(Health_state_n = case_when(Health_state == "Health" ~ 0,
Health_state == "Disease" ~ 1))
ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
geom_point() +
theme_bw() +
ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")
adonis2(d ~ Health_state_n, data = df_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 1046 0.0122 4.6817 0.007 **
## Residual 379 84659 0.9878
## Total 380 85705 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Wilcox_comparison_round_0 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,0))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_0),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 100
## Количество значимо различающихся таксонов по p_value_holm 54
## Количество значимо различающихся таксонов по p_value_BH 100
Wilcox_comparison_round_1 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,1))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_1),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 273
## Количество значимо различающихся таксонов по p_value_holm 155
## Количество значимо различающихся таксонов по p_value_BH 273
Wilcox_comparison_round_2 <- data_wide %>%
select(Health_state, Archaea, Bacteria,
ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>%
mutate(across (where (is.numeric), function (x) round (x,2))) %>%
summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>%
pivot_longer(everything()) %>%
rename (Taxon = name, p_value = value) %>%
filter (p_value <= 0.05 ) %>%
arrange(p_value) %>%
add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>%
add_column(p_value_BH = p.adjust(.$p_value, "BH"))
rbind (
"Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_2),
"Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_holm <= 0.05 )),
"Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_BH <= 0.05 ))
)
## [,1]
## Количество значимо различающихся таксонов по p_value 599
## Количество значимо различающихся таксонов по p_value_holm 374
## Количество значимо различающихся таксонов по p_value_BH 599
library("haven")
library("ResourceSelection") ## Package to perform the Hosmer-Lemeshow GOF test
library("survey")
library("prediction")
library("margins")
library("ggeffects")
library("sjPlot")
library("statmod")
#devtools::install_github("strengejacke/strengejacke")
#install.packages(c("haven", "ResourceSelection", "survey", "prediction", "margins", "ggeffects", "sjPlot", "statmod"))
options(survey.lonely.psu = 'adjust')
start_values <- c(0, 0, 1)
analyze_taxon <- function(taxon_name, data) {
tryCatch(
{
data_filtered <- data %>%
filter(Taxon == taxon_name) %>%
select(patient_ID, Health_state, Percentage, Age_range, research_ID) %>%
mutate(Health_state_num = ifelse(Health_state == "Health", 0, 1)) # для упрацения интерпритации процентов
mepsdsgn = svydesign(
id = ~patient_ID,
strata = ~research_ID,
weights = NULL,
data = data_filtered,
nest = TRUE)
start_values <- c(0, 0, 1)
model <- svyglm(Percentage ~ Health_state_num + research_ID,
mepsdsgn,
family = tweedie(var.power = 2, link.power = 1),
start = start_values)
summary_table <- summary(model)
tidy_output <- tidy(model)
result_table1 <- tidy_output %>%
filter(term == "Health_state_num") %>%
select(estimate, p.value)
result_table2 <- as.data.frame(confint(model)["Health_state_num", ])
result_table2_transposed <- t(result_table2)
final_result_table_transposed <- bind_cols(result_table1, result_table2_transposed) %>%
select(estimate, `2.5 %`, `97.5 %`, p.value)
final_result_table_transposed$Taxon <- taxon_name
return(final_result_table_transposed)
},
error = function(e) {
# Обработка ошибки (можно добавить сообщение или просто вернуть NULL)
# cat("Error in analyze_taxon for Taxon:", taxon_name, "\n")
return(NULL)
}
)
}
# Применение функции ко всем таксонам
unique_taxa <- unique(G_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, G_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_G <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
#install.packages("knitr")
#install.packages("kableExtra")
# Загрузка библиотек
library(knitr)
library(kableExtra)
# Ваш код
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_G, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| 0.114 | 0.0529 | 0.1743 | 0.0002665 | Methanobrevibacter_G | 0.031 |
| -0.107 | -0.1274 | -0.0869 | 0.0000000 | Flavobacterium_G | 0.000 |
| 0.086 | 0.0564 | 0.1152 | 0.0000000 | Senegalimassilia_G | 0.000 |
| 0.065 | 0.0375 | 0.0929 | 0.0000052 | Solobacterium_G | 0.001 |
| 0.058 | 0.0444 | 0.0707 | 0.0000000 | Acidaminobacter_G | 0.000 |
| -0.050 | -0.0664 | -0.0329 | 0.0000000 | Alkaliflexus_G | 0.000 |
| 0.050 | 0.0277 | 0.0724 | 0.0000141 | Faecalitalea_G | 0.002 |
| 0.018 | 0.0117 | 0.0242 | 0.0000000 | Lactiplantibacillus_G | 0.000 |
| -0.016 | -0.0198 | -0.0116 | 0.0000000 | Thermacetogenium_G | 0.000 |
| 0.013 | 0.0071 | 0.0197 | 0.0000338 | Flavonifractor_G | 0.004 |
| 0.013 | 0.0082 | 0.0178 | 0.0000002 | Oxalobacter_G | 0.000 |
| 0.013 | 0.0095 | 0.0173 | 0.0000000 | Porphyrobacter_G | 0.000 |
| 0.010 | 0.0062 | 0.0129 | 0.0000000 | Anoxybacillus_G | 0.000 |
| 0.009 | 0.0068 | 0.0113 | 0.0000000 | Anaerostignum_G | 0.000 |
| -0.009 | -0.0113 | -0.0075 | 0.0000000 | Erysipelotrichaceae UCG-002_G | 0.000 |
| 0.009 | 0.0071 | 0.0104 | 0.0000000 | Halochromatium_G | 0.000 |
| 0.009 | 0.0063 | 0.0127 | 0.0000000 | Lachnospiraceae NK4B4 group_G | 0.000 |
| -0.007 | -0.0101 | -0.0038 | 0.0000201 | Candidatus Armantifilum_G | 0.003 |
| 0.007 | 0.0057 | 0.0084 | 0.0000000 | Colwellia_G | 0.000 |
| 0.007 | 0.0036 | 0.0112 | 0.0001343 | Marinobacter_G | 0.016 |
| 0.006 | 0.0047 | 0.0079 | 0.0000000 | Hespellia_G | 0.000 |
| -0.006 | -0.0086 | -0.0030 | 0.0000666 | Rikenella_G | 0.008 |
| -0.006 | -0.0077 | -0.0052 | 0.0000000 | Sedimentibacter_G | 0.000 |
| 0.004 | 0.0024 | 0.0058 | 0.0000022 | Acholeplasma_G | 0.000 |
| -0.004 | -0.0046 | -0.0029 | 0.0000000 | Crocinitomix_G | 0.000 |
| 0.004 | 0.0022 | 0.0052 | 0.0000021 | Lentilactobacillus_G | 0.000 |
| -0.004 | -0.0062 | -0.0024 | 0.0000098 | Macellibacteroides_G | 0.001 |
| 0.004 | 0.0030 | 0.0047 | 0.0000000 | Pseudarthrobacter_G | 0.000 |
| 0.004 | 0.0019 | 0.0054 | 0.0000631 | Rubrobacter_G | 0.008 |
| 0.004 | 0.0032 | 0.0048 | 0.0000000 | Syntrophomonas_G | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000004 | Achromobacter_G | 0.000 |
| 0.003 | 0.0013 | 0.0041 | 0.0001600 | Antarcticibacterium_G | 0.019 |
| -0.003 | -0.0047 | -0.0019 | 0.0000036 | Coriobacteriaceae UCG-002_G | 0.001 |
| -0.003 | -0.0037 | -0.0016 | 0.0000009 | Desulfurispora_G | 0.000 |
| -0.003 | -0.0031 | -0.0021 | 0.0000000 | Formosa_G | 0.000 |
| 0.003 | 0.0015 | 0.0040 | 0.0000102 | Halobacillus_G | 0.001 |
| 0.003 | 0.0022 | 0.0046 | 0.0000000 | Hassallia_G | 0.000 |
| -0.003 | -0.0043 | -0.0014 | 0.0001027 | Hydrogenispora_G | 0.013 |
| -0.003 | -0.0035 | -0.0023 | 0.0000000 | Lachnotalea_G | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000005 | Leptococcus JA-3-3Ab_G | 0.000 |
| -0.003 | -0.0045 | -0.0016 | 0.0000334 | Marinifilum_G | 0.004 |
| -0.003 | -0.0037 | -0.0021 | 0.0000000 | Oxobacter_G | 0.000 |
| -0.003 | -0.0039 | -0.0022 | 0.0000000 | Paramaledivibacter_G | 0.000 |
| -0.003 | -0.0042 | -0.0016 | 0.0000109 | Pygmaiobacter_G | 0.001 |
| 0.003 | 0.0012 | 0.0040 | 0.0001789 | RB41_G | 0.021 |
| -0.003 | -0.0046 | -0.0017 | 0.0000214 | Salinibacter_G | 0.003 |
| -0.003 | -0.0033 | -0.0021 | 0.0000000 | Serpentinicella_G | 0.000 |
| -0.003 | -0.0032 | -0.0023 | 0.0000000 | Subsaximicrobium_G | 0.000 |
| -0.003 | -0.0040 | -0.0013 | 0.0000805 | XBB1006_G | 0.010 |
| -0.002 | -0.0031 | -0.0015 | 0.0000001 | Acetoanaerobium_G | 0.000 |
| 0.002 | 0.0011 | 0.0023 | 0.0000001 | Anaerospora_G | 0.000 |
| -0.002 | -0.0026 | -0.0010 | 0.0000107 | Chthoniobacter_G | 0.001 |
| 0.002 | 0.0010 | 0.0028 | 0.0000743 | Nocardia_G | 0.009 |
| 0.002 | 0.0012 | 0.0038 | 0.0001601 | Spirochaeta 2_G | 0.019 |
| 0.002 | 0.0012 | 0.0023 | 0.0000000 | Truepera_G | 0.000 |
| -0.001 | -0.0020 | -0.0008 | 0.0000040 | Aeriscardovia_G | 0.001 |
| -0.001 | -0.0013 | -0.0006 | 0.0000014 | Arcicella_G | 0.000 |
| -0.001 | -0.0020 | -0.0009 | 0.0000008 | Blvii28 wastewater-sludge group_G | 0.000 |
| -0.001 | -0.0014 | -0.0006 | 0.0000020 | Clostridiisalibacter_G | 0.000 |
| -0.001 | -0.0021 | -0.0008 | 0.0000194 | Izimaplasma_G | 0.003 |
| -0.001 | -0.0021 | -0.0007 | 0.0000428 | Lawsonella_G | 0.005 |
| 0.001 | 0.0008 | 0.0017 | 0.0000000 | Leucobacter_G | 0.000 |
| -0.001 | -0.0014 | -0.0006 | 0.0000029 | Pseudoclostridium_G | 0.000 |
| 0.001 | 0.0007 | 0.0017 | 0.0000031 | Sporanaerobacter_G | 0.000 |
| -0.001 | -0.0018 | -0.0007 | 0.0000117 | Virgibacillus_G | 0.002 |
| -0.001 | -0.0015 | -0.0007 | 0.0000001 | [Eubacterium] yurii group_G | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(F_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, F_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_F <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_F, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| 0.891 | 0.7287 | 1.0532 | 0.0000000 | Pedosphaeraceae_F | 0.000 |
| 0.301 | 0.2544 | 0.3471 | 0.0000000 | A4b_F | 0.000 |
| -0.080 | -0.1152 | -0.0445 | 0.0000117 | Methanobacteriaceae_F | 0.001 |
| 0.063 | 0.0480 | 0.0783 | 0.0000000 | Chromatiaceae_F | 0.000 |
| 0.058 | 0.0444 | 0.0707 | 0.0000000 | Acidaminobacteraceae_F | 0.000 |
| -0.055 | -0.0739 | -0.0360 | 0.0000000 | Clade II_F | 0.000 |
| -0.021 | -0.0329 | -0.0097 | 0.0003409 | Puniceicoccaceae_F | 0.017 |
| -0.016 | -0.0198 | -0.0117 | 0.0000000 | Thermacetogeniaceae_F | 0.000 |
| 0.010 | 0.0054 | 0.0150 | 0.0000326 | Sphingomonadaceae_F | 0.002 |
| 0.007 | 0.0036 | 0.0112 | 0.0001343 | Marinobacteraceae_F | 0.007 |
| 0.007 | 0.0036 | 0.0111 | 0.0001574 | Micromonosporaceae_F | 0.008 |
| 0.006 | 0.0038 | 0.0079 | 0.0000000 | Colwelliaceae_F | 0.000 |
| 0.006 | 0.0037 | 0.0082 | 0.0000004 | Hydrogenedensaceae_F | 0.000 |
| 0.006 | 0.0051 | 0.0073 | 0.0000000 | Iamiaceae_F | 0.000 |
| 0.006 | 0.0043 | 0.0076 | 0.0000000 | NS11-12 marine group_F | 0.000 |
| -0.006 | -0.0077 | -0.0052 | 0.0000000 | Sedimentibacteraceae_F | 0.000 |
| 0.006 | 0.0034 | 0.0078 | 0.0000009 | WD2101 soil group_F | 0.000 |
| -0.004 | -0.0042 | -0.0032 | 0.0000000 | Aureobasidiaceae_F | 0.000 |
| -0.004 | -0.0057 | -0.0016 | 0.0004772 | Didymellaceae_F | 0.024 |
| 0.004 | 0.0019 | 0.0054 | 0.0000631 | Rubrobacteriaceae_F | 0.004 |
| 0.004 | 0.0031 | 0.0055 | 0.0000000 | Syntrophomonadaceae_F | 0.000 |
| 0.004 | 0.0022 | 0.0059 | 0.0000247 | Thermomonosporaceae_F | 0.001 |
| -0.003 | -0.0043 | -0.0017 | 0.0000112 | Chthoniobacteraceae_F | 0.001 |
| -0.003 | -0.0037 | -0.0016 | 0.0000009 | Desulfurisporaceae_F | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000005 | Leptococcaceae_F | 0.000 |
| 0.003 | 0.0020 | 0.0034 | 0.0000000 | Limnochordaceae_F | 0.000 |
| 0.003 | 0.0019 | 0.0033 | 0.0000000 | Obscuribacteraceae_F | 0.000 |
| -0.003 | -0.0037 | -0.0021 | 0.0000000 | Oxobacteraceae_F | 0.000 |
| -0.003 | -0.0040 | -0.0014 | 0.0000790 | Pirellulaceae_F | 0.004 |
| 0.003 | 0.0013 | 0.0040 | 0.0001420 | Pyrinomonadaceae_F | 0.008 |
| -0.003 | -0.0044 | -0.0013 | 0.0003096 | Salinivirgaceae_F | 0.016 |
| 0.003 | 0.0020 | 0.0038 | 0.0000000 | Xanthobacteraceae_F | 0.000 |
| 0.002 | 0.0019 | 0.0030 | 0.0000000 | 09D2Z48_F | 0.000 |
| -0.002 | -0.0026 | -0.0012 | 0.0000004 | Izemoplasmataceae_F | 0.000 |
| -0.002 | -0.0033 | -0.0015 | 0.0000001 | PeH15_F | 0.000 |
| 0.002 | 0.0007 | 0.0025 | 0.0006969 | Rs-E47 termite group_F | 0.033 |
| 0.002 | 0.0020 | 0.0028 | 0.0000000 | Simkaniaceae_F | 0.000 |
| -0.002 | -0.0038 | -0.0010 | 0.0006464 | TRA3-20_F | 0.031 |
| -0.002 | -0.0033 | -0.0015 | 0.0000001 | Trichocomaceae_F | 0.000 |
| 0.002 | 0.0012 | 0.0023 | 0.0000000 | Trueperaceae_F | 0.000 |
| 0.002 | 0.0016 | 0.0030 | 0.0000000 | type III_F | 0.000 |
| -0.001 | -0.0020 | -0.0008 | 0.0000131 | 01D2Z36_F | 0.001 |
| -0.001 | -0.0012 | -0.0003 | 0.0005701 | Bacteriovoracaceae_F | 0.028 |
| -0.001 | -0.0019 | -0.0009 | 0.0000003 | Cladosporiaceae_F | 0.000 |
| 0.001 | 0.0006 | 0.0021 | 0.0006965 | Halothiobacillaceae_F | 0.033 |
| 0.001 | 0.0009 | 0.0017 | 0.0000000 | Streptosporangiaceae_F | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(C_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, C_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_C <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_C, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| -0.080 | -0.1152 | -0.0445 | 1.17e-05 | Methanobacteria_C | 0.000 |
| 0.036 | 0.0275 | 0.0435 | 0.00e+00 | Thermodesulfovibrionia_C | 0.000 |
| 0.022 | 0.0172 | 0.0265 | 0.00e+00 | Armatimonadia_C | 0.000 |
| -0.017 | -0.0222 | -0.0124 | 0.00e+00 | Dothideomycetes_C | 0.000 |
| -0.016 | -0.0198 | -0.0117 | 0.00e+00 | Thermacetogenia_C | 0.000 |
| 0.014 | 0.0105 | 0.0166 | 0.00e+00 | Acidimicrobiia_C | 0.000 |
| 0.012 | 0.0090 | 0.0144 | 0.00e+00 | Chlamydiae_C | 0.000 |
| 0.010 | 0.0072 | 0.0126 | 0.00e+00 | Dehalococcoidia_C | 0.000 |
| 0.009 | 0.0064 | 0.0108 | 0.00e+00 | Desulfotomaculia_C | 0.000 |
| -0.006 | -0.0074 | -0.0036 | 0.00e+00 | D8A-2_C | 0.000 |
| 0.006 | 0.0037 | 0.0082 | 4.00e-07 | Hydrogenedentia_C | 0.000 |
| 0.006 | 0.0042 | 0.0076 | 0.00e+00 | Thermoleophilia_C | 0.000 |
| 0.006 | 0.0034 | 0.0078 | 1.00e-06 | Vicinamibacteria_C | 0.000 |
| 0.005 | 0.0030 | 0.0071 | 2.80e-06 | Ignavibacteria_C | 0.000 |
| -0.004 | -0.0052 | -0.0026 | 0.00e+00 | Eurotiomycetes_C | 0.000 |
| 0.004 | 0.0019 | 0.0054 | 6.31e-05 | Rubrobacteria_C | 0.001 |
| 0.004 | 0.0031 | 0.0055 | 0.00e+00 | Syntrophomonadia_C | 0.000 |
| 0.004 | 0.0026 | 0.0055 | 0.00e+00 | vadinHA49_C | 0.000 |
| 0.003 | 0.0017 | 0.0042 | 4.70e-06 | MB-A2-108_C | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(O_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, O_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_O <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_O, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| 0.891 | 0.7287 | 1.0532 | 0.0000000 | Pedosphaerales_O | 0.000 |
| -0.080 | -0.1152 | -0.0445 | 0.0000117 | Methanobacteriales_O | 0.000 |
| 0.062 | 0.0465 | 0.0773 | 0.0000000 | Chromatiales_O | 0.000 |
| 0.024 | 0.0176 | 0.0306 | 0.0000000 | Rhizobiales_O | 0.000 |
| 0.022 | 0.0172 | 0.0265 | 0.0000000 | Armatimonadales_O | 0.000 |
| -0.016 | -0.0198 | -0.0117 | 0.0000000 | Thermacetogeniales_O | 0.000 |
| -0.012 | -0.0141 | -0.0100 | 0.0000000 | Candidatus Kerfeldbacteria_O | 0.000 |
| -0.011 | -0.0162 | -0.0051 | 0.0001987 | Hypocreales_O | 0.007 |
| 0.010 | 0.0087 | 0.0120 | 0.0000000 | LD1-PA32_O | 0.000 |
| 0.010 | 0.0054 | 0.0150 | 0.0000326 | Sphingomonadales_O | 0.001 |
| 0.008 | 0.0054 | 0.0096 | 0.0000000 | Desulfotomaculales_O | 0.000 |
| 0.008 | 0.0059 | 0.0106 | 0.0000000 | Microtrichales_O | 0.000 |
| -0.007 | -0.0093 | -0.0046 | 0.0000000 | Chthoniobacterales_O | 0.000 |
| 0.007 | 0.0036 | 0.0111 | 0.0001574 | Micromonosporales_O | 0.006 |
| 0.006 | 0.0037 | 0.0082 | 0.0000004 | Hydrogenedentiales_O | 0.000 |
| -0.006 | -0.0092 | -0.0033 | 0.0000447 | Pleosporales_O | 0.002 |
| -0.005 | -0.0051 | -0.0040 | 0.0000000 | Dothideales_O | 0.000 |
| 0.005 | 0.0032 | 0.0077 | 0.0000024 | Tepidisphaerales_O | 0.000 |
| 0.004 | 0.0014 | 0.0058 | 0.0015949 | Entomoplasmatales_O | 0.048 |
| 0.004 | 0.0019 | 0.0054 | 0.0000631 | Rubrobacterales_O | 0.002 |
| 0.004 | 0.0031 | 0.0055 | 0.0000000 | Syntrophomonadales_O | 0.000 |
| -0.003 | -0.0036 | -0.0016 | 0.0000005 | Eurycoccales_O | 0.000 |
| 0.003 | 0.0022 | 0.0037 | 0.0000000 | Limnochordales_O | 0.000 |
| 0.003 | 0.0019 | 0.0033 | 0.0000000 | Obscuribacterales_O | 0.000 |
| -0.003 | -0.0040 | -0.0014 | 0.0000790 | Pirellulales_O | 0.003 |
| 0.003 | 0.0013 | 0.0040 | 0.0001420 | Pyrinomonadales_O | 0.005 |
| -0.003 | -0.0042 | -0.0017 | 0.0000072 | RBG-13-54-9_O | 0.000 |
| 0.003 | 0.0015 | 0.0053 | 0.0003859 | Synechococcales_O | 0.012 |
| 0.003 | 0.0019 | 0.0034 | 0.0000000 | eub62A3_O | 0.000 |
| 0.002 | 0.0009 | 0.0027 | 0.0000904 | Candidatus Abawacabacteria_O | 0.003 |
| 0.002 | 0.0010 | 0.0021 | 0.0000002 | Candidatus Peregrinibacteria_O | 0.000 |
| 0.002 | 0.0011 | 0.0020 | 0.0000000 | Candidatus Terrybacteria_O | 0.000 |
| -0.002 | -0.0029 | -0.0012 | 0.0000018 | Capnodiales_O | 0.000 |
| -0.002 | -0.0033 | -0.0015 | 0.0000001 | Eurotiales_O | 0.000 |
| 0.002 | 0.0017 | 0.0033 | 0.0000000 | Gaiellales_O | 0.000 |
| 0.002 | 0.0009 | 0.0032 | 0.0003267 | Ignavibacteriales_O | 0.011 |
| -0.001 | -0.0012 | -0.0003 | 0.0005701 | Bacteriovoracales_O | 0.018 |
| 0.001 | 0.0008 | 0.0018 | 0.0000027 | S085_O | 0.000 |
# Применение функции ко всем таксонам
unique_taxa <- unique(P_long$Taxon)
result_list <- lapply(unique_taxa, function(taxon) {
analyze_result <- analyze_taxon(taxon, P_long)
if (!is.null(analyze_result)) {
return(analyze_result)
} else {
return(data.frame()) # Вернуть пустой data.frame, чтобы не влиять на bind_rows
}
})
# Объединение результатов в один dataframe
final_result_df <- bind_rows(result_list) %>%
mutate(estimate = round(estimate, 3),
`2.5 %` = round(`2.5 %`, 4),
`97.5 %` = round(`97.5 %`, 4)
)
# Коррекция на множественные сравнения по методу Холма
final_result_df$p.adjusted <- round(p.adjust(final_result_df$p.value, method = "holm"),3)
# Фильтрация только статистически значимых результатов
significant_results_P <- final_result_df %>%
filter(p.adjusted < 0.05) %>% arrange(desc(abs(estimate)))
# Вывод таблички
cat("Статистически значимые результаты:\n")
## Статистически значимые результаты:
kable(significant_results_P, "html") %>%
kable_styling(full_width = F)
| estimate | 2.5 % | 97.5 % | p.value | Taxon | p.adjusted |
|---|---|---|---|---|---|
| -15.536 | -22.0641 | -9.0082 | 4.00e-06 | Firmicutes_P | 0.000 |
| -0.080 | -0.1152 | -0.0445 | 1.17e-05 | Euryarchaeota_P | 0.000 |
| 0.038 | 0.0302 | 0.0467 | 0.00e+00 | Armatimonadota_P | 0.000 |
| 0.038 | 0.0292 | 0.0458 | 0.00e+00 | Nitrospirota_P | 0.000 |
| -0.015 | -0.0212 | -0.0090 | 1.50e-06 | Basidiomycota_P | 0.000 |
| 0.006 | 0.0037 | 0.0082 | 4.00e-07 | Hydrogenedentes_P | 0.000 |
| 0.006 | 0.0032 | 0.0097 | 9.66e-05 | Myxococcota_P | 0.001 |
create_and_plot_taxon_groups <- function(filter_pattern, step_size, dataset_name) {
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl(filter_pattern, Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим графики
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
current_dataset <- get(dataset_name) # Получаем датасет по его имени
G_long_filtered <- current_dataset %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
group_by(Health_state, Taxon) %>%
summarise(Count = n(), .groups = "drop")
bar_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Count, fill = Health_state == "Disease")) +
geom_bar(position = "dodge", color = "black", stat = "identity") +
scale_fill_manual(values = c("skyblue", "red"), guide = FALSE) +
labs(title = paste("Гистограмма для таксонов", (i - 1) * step_size + 1, "до", min(i * step_size, nrow(unique_taxa)), "в разрезе Health_state"),
x = "Статус пациента",
y = "Количество пациентов, у которых обнаружен данный таксон") +
coord_flip() +
facet_wrap(~Taxon, scales = "free_y", ncol = 2, shrink = 0.7) + # Уменьшение пространства между фасетами
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5), # Поворот текста на оси X и уменьшение шрифта
axis.text.y = element_text(hjust = 1, size = 5),
strip.text = element_text(size = 5)) # Уменьшение шрифта для названия фасет
print(bar_plot) # Отображение графика на экране
}
}
# Пример вызова функции с передачей "_G" в качестве аргумента и названия датасета "G_long"
create_and_plot_taxon_groups("_G", 18, "G_long")
create_and_plot_circular_diagrams <- function(dataset, filter_pattern, step_size, filename) {
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl(filter_pattern, Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим графики
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- dataset %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
group_by(Health_state, Taxon) %>%
summarise(Count = n(), .groups = "drop")
# Добавляем строки с Count = 0 для всех комбинаций Health_state и Taxon
G_long_filtered <- G_long_filtered %>%
complete(Health_state, Taxon, fill = list(Count = 0))
data_id <- G_long_filtered %>% filter(G_long_filtered$Health_state == "Disease") %>%
mutate(id = row_number()) %>%
select(Taxon, id)
G_long_filtered <- left_join(G_long_filtered, data_id, by = "Taxon")
labels_data <- G_long_filtered %>% filter(Health_state == "Disease")
number_of_bars <- nrow(labels_data)
#Вычислим углы для лейбла каждого барра
labels_data$angel <- 90 - 360 * (labels_data$id-0.5) / number_of_bars
# Добавим горизонтольную регулировку
labels_data <- labels_data %>%
mutate(hjust = ifelse(angel < -90, 1, 0))
# Перевернем лейбл в зависимости от "полушария"
labels_data <- labels_data %>%
mutate(angel = ifelse(angel < -90, angel + 180, angel))
# Круговая диаграмма для объединенного датасета
p <- ggplot(data = G_long_filtered, mapping = aes(x = id, y = Count, fill = Health_state)) +
geom_bar(stat = "identity", position = "stack") +
ylim(-150, 500) +
# Добавляем кастомную тему
theme_minimal(base_size = 8) +
theme(axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(rep(-1, 6), "cm")) +
coord_polar(start = 0)+
geom_text(data = labels_data,
aes(x = id, y = Count,
label = Taxon,
hjust = hjust),
color = "black",
fontface = "bold",
alpha = 0.9,
size = 1.5,
angle = labels_data$angel,
inherit.aes = FALSE) +
# Добавляем горизонтальные линии сетки
geom_hline(yintercept = seq(0, 300, by = 50),
linetype = "dotted",
color = "gray",
size = 0.5) +
# Добавляем значения рядом с линиями сетки
annotate("text", x = 1.2, y = seq(0, 300, by = 50),
label = seq(0, 300, by = 50),
color = "black",
size = 3,
alpha = 40,
hjust = 0)
p <- p + annotate("text", x = 0.5, y = 490, label = "Сотношение здоровых пациентов и пациентов с СРК",
size = 5, color = "black", fontface = "bold", hjust = 0.5)
# Сформируем имя файла для сохранения
current_filename <- paste0(filename, "_", i, ".png")
# Сохраняем график под уникальным именем
ggsave(current_filename, p, width = 10, height = 8)
print(p)
}
}
# Пример вызова функции
create_and_plot_circular_diagrams(G_long, "_G", 40, "output_plot")
create_and_plot_boxplots <- function(G_long, filter_pattern, step_size, x_var, title) {
# Получаем уникальные таксоны, удовлетворяющие условиям
unique_taxa <- Wilcox_comparison_round_2 %>%
subset(grepl(filter_pattern, Taxon)) %>%
filter(p_value_holm < 0.05) %>%
distinct(Taxon)
# Разбиваем уникальные таксоны на группы по step_size
taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
# Проходим по группам и строим боксплоты
for (i in seq_along(taxon_groups)) {
current_taxa <- taxon_groups[[i]]
G_long_filtered <- G_long %>%
filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
# Создаем формулу для переменной x_var
formula <- as.formula(paste("Age ~", x_var))
# Строим боксплот
box_plot <- ggplot(G_long_filtered, aes_string(x = x_var, y = "Age", fill = "Health_state")) +
geom_boxplot() +
labs(title = paste(title),
x = x_var,
y = "Age") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8), # Уменьшаем размер шрифта для подписей Taxon
axis.title.x = element_blank(), # Убираем название оси X
legend.position = "bottom") + # Перемещаем легенду вниз
scale_fill_manual(values = c("red", "skyblue")) +
facet_grid(~Taxon, scales = "free_y", space = "free") # Используем facet_grid вместо facet_wrap
# Отображаем график на экране
print(box_plot)
}
}
# Пример вызова функции с Health_state в качестве x_var
create_and_plot_boxplots(G_long, "_G", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_F", 18, "F_long")
create_and_plot_circular_diagrams(F_long, "_F", 40, "F_output_plot")
create_and_plot_boxplots(F_long, "_F", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_C", 18, "C_long")
create_and_plot_circular_diagrams(C_long, "_C", 40, "F_output_plot")
create_and_plot_boxplots(C_long, "_C", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_O", 18, "O_long")
create_and_plot_circular_diagrams(O_long, "_O", 30, "F_output_plot")
create_and_plot_boxplots(O_long, "_O", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
create_and_plot_taxon_groups("_P", 18, "P_long")
create_and_plot_circular_diagrams(P_long, "_P", 30, "P_output_plot")
create_and_plot_boxplots(P_long, "_P", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
perform_permutation_test <- function(data, filter_pattern) {
combined_bacteria_G <- data %>%
select(Health_state, ends_with(filter_pattern))
d <- dist(combined_bacteria_G) # euclidean distances between the rows
fit <- cmdscale(d, eig=TRUE, k=2) # k is the number of dim
df_mds <- data.frame(
x = fit$points[,1],
y = fit$points[,2]
)
df_full <- cbind(df_mds, combined_info) %>%
mutate(Health_state_n = case_when(Health_state == "Health" ~ 0,
Health_state == "Disease" ~ 1))
msd_plot <- ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
geom_point() +
theme_bw() +
ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")
print(msd_plot)
print("Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state")
adonis2(d ~ Health_state_n, data = df_full)
}
# Пример вызова функции с использованием другого фильтра
perform_permutation_test(combined_bacteria_clean, "_G")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 1046 0.0122 4.6817 0.004 **
## Residual 379 84659 0.9878
## Total 380 85705 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(combined_bacteria_clean, "_F")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 1919 0.01423 5.4706 0.002 **
## Residual 379 132940 0.98577
## Total 380 134859 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(combined_bacteria_clean, "_C")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 32982 0.12662 54.948 0.001 ***
## Residual 379 227490 0.87338
## Total 380 260472 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(combined_bacteria_clean, "_O")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 6923 0.04767 18.97 0.001 ***
## Residual 379 138309 0.95233
## Total 380 145232 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
perform_permutation_test(combined_bacteria_clean, "_P")
## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = d ~ Health_state_n, data = df_full)
## Df SumOfSqs R2 F Pr(>F)
## Health_state_n 1 32728 0.12502 54.155 0.001 ***
## Residual 379 229042 0.87498
## Total 380 261770 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library("lme4")
library("stringr")
#Переменная имеет биномиальное распределение
plot(data_long$Серотонин)
#Процент пропущенных значений в переменной Серотонин 96,67%
na_serotonin <- sum(is.na(data_long$Серотонин)) / nrow(data_long) * 100
#Данные о серотонине есть только в F и G taxons
serotonin_taxons <- data_long %>%
filter(!is.na(`Серотонин`)) %>%
mutate(Serotonin_taxons = str_sub(Taxon, -2)) %>%
distinct(Serotonin_taxons) %>%
pull(Serotonin_taxons)
#При этом только 2 F таксона и 42 G таксона
serotonin_taxons_count_unique <- bac_functions %>%
filter(!is.na(Серотонин)) %>%
mutate(Serotonin_taxons = str_sub(Taxon, -2)) %>%
count(Serotonin_taxons)
#А всего 219 F в датасете
num_unique_F <- data_long %>%
filter(str_detect(Taxon, "_F$")) %>%
summarise(Num_Unique = n_distinct(Taxon)) %>%
pull(Num_Unique)
###Добавляем данные по родам из двух семейств в bac_functions (данные добавлены в 8-й лист Bacterial group functions.xlsx)
# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")
neuromediators_1 <- read_xlsx (path, 8) %>%
mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>%
unique() %>%
pivot_wider(names_from = Neuromediator, values_from = Destroy)
probiotics <- read_xlsx (path, 3) %>%
add_column(probiotics = 1)
special_properties <- read_xlsx (path, 4) %>%
add_column(special_properties = 1)
vitamins <- read_xlsx (path, 5) %>%
pivot_wider (names_from = Vitamin, values_from = Vitamin,
values_fn = function(x) ifelse(is.na (x), NA, 1))
habbits <- read_xlsx (path, 7) %>%
unique() %>% #удаление повторяющихся строк
pivot_wider(names_from = Habbit, values_from = Habit_state)
bac_functions_1 <- read_xlsx (path, 1) %>% #Патогены и нежелательные
full_join(neuromediators_1, by = taxon) %>% #Нейромедиаторы
full_join(probiotics, by = taxon) %>% #Пробиотики
full_join(special_properties, by = taxon) %>% #С особыми свойствами
full_join(vitamins, by = taxon) %>% #Витамины
full_join(read_xlsx (path, 6), by = taxon) %>% #Продуценты КЦДК
full_join(habbits, by = taxon) %>% #Вредные привычки
unite("Taxon", TaxonName, Rank, sep = "_") %>%
filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
mutate_all(as.factor)
rm (path, taxon, neuromediators_1, probiotics, special_properties, vitamins, habbits)
data_long_1 <- data_wide %>%
pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
names_to = "Taxon", values_to = "Percentage")
#Перезапись data_long с добавлением функций бактерий
data_long_1 <- data_long_1 %>%
left_join (bac_functions_1, by = "Taxon")
G_long_1 <- data_long_1 %>% subset(grepl("_G", Taxon))
#Модель отдельно для таксонов G уровня
G_long_serotonin <- G_long_1 %>%
filter( ! is.na(`Серотонин`)) %>%
filter(Percentage > 0.0000)
G_long_serotonin$`Серотонин` <- relevel(G_long_serotonin$`Серотонин`, ref = "destroy")
G_long_serotonin$research_ID <- as.factor(G_long_serotonin$research_ID)
G_long_serotonin$Seq_date <- as.factor(G_long_serotonin$Seq_date)
G_long_serotonin$Seq_date <- as.factor(G_long_serotonin$Seq_date)
G_long_serotonin$patient_ID <- as.factor(G_long_serotonin$patient_ID)
model_ser_G_1 <- glmer(Health_state ~ Серотонин + (1 | research_ID), data = G_long_serotonin, family = binomial)
summary(model_ser_G_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Health_state ~ Серотонин + (1 | research_ID)
## Data: G_long_serotonin
##
## AIC BIC logLik deviance df.resid
## 8002.6 8027.1 -3998.3 7996.6 26740
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.58164 -0.00200 0.00227 0.00291 0.67902
##
## Random effects:
## Groups Name Variance Std.Dev.
## research_ID (Intercept) 482.6 21.97
## Number of obs: 26743, groups: research_ID, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.30150 1.93365 0.156 0.8761
## Серотонинproduce 0.14273 0.07275 1.962 0.0498 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Серотонинpr 0.000
Модель не показывает значимой ассоциации наличия серотонинпродуцирующих бактерий и состоянием здоровья человека (СРК/здоровый).